From a7201369753851b8aaae0440f6afdd9ef350bec0 Mon Sep 17 00:00:00 2001 From: Kiran Prakash Date: Wed, 7 Aug 2024 10:30:02 -0700 Subject: [PATCH 01/47] cancellation related Signed-off-by: Kiran Prakash --- .../AbstractTaskSelectionStrategy.java | 81 +++++ .../cancellation/DefaultTaskCancellation.java | 218 +++++++++++ ...gestRunningTaskFirstSelectionStrategy.java | 29 ++ ...testRunningTaskFirstSelectionStrategy.java | 29 ++ .../cancellation/TaskSelectionStrategy.java | 32 ++ .../wlm/cancellation/package-info.java | 12 + .../DefaultTaskCancellationTests.java | 340 ++++++++++++++++++ ...skFirstStrategySelectionStrategyTests.java | 34 ++ ...skFirstStrategySelectionStrategyTests.java | 34 ++ .../TaskSelectionStrategyTests.java | 121 +++++++ 10 files changed, 930 insertions(+) create mode 100644 server/src/main/java/org/opensearch/wlm/cancellation/AbstractTaskSelectionStrategy.java create mode 100644 server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java create mode 100644 server/src/main/java/org/opensearch/wlm/cancellation/LongestRunningTaskFirstSelectionStrategy.java create mode 100644 server/src/main/java/org/opensearch/wlm/cancellation/ShortestRunningTaskFirstSelectionStrategy.java create mode 100644 server/src/main/java/org/opensearch/wlm/cancellation/TaskSelectionStrategy.java create mode 100644 server/src/main/java/org/opensearch/wlm/cancellation/package-info.java create mode 100644 server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java create mode 100644 server/src/test/java/org/opensearch/wlm/cancellation/LongestRunningTaskFirstStrategySelectionStrategyTests.java create mode 100644 server/src/test/java/org/opensearch/wlm/cancellation/ShortestRunningTaskFirstStrategySelectionStrategyTests.java create mode 100644 server/src/test/java/org/opensearch/wlm/cancellation/TaskSelectionStrategyTests.java diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/AbstractTaskSelectionStrategy.java 
b/server/src/main/java/org/opensearch/wlm/cancellation/AbstractTaskSelectionStrategy.java new file mode 100644 index 0000000000000..4f592392a3d63 --- /dev/null +++ b/server/src/main/java/org/opensearch/wlm/cancellation/AbstractTaskSelectionStrategy.java @@ -0,0 +1,81 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.wlm.cancellation; + +import org.opensearch.search.ResourceType; +import org.opensearch.tasks.CancellableTask; +import org.opensearch.tasks.Task; +import org.opensearch.tasks.TaskCancellation; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.stream.Collectors; + +/** + * Represents an abstract task selection strategy. + * This class implements the TaskSelectionStrategy interface and provides a method to select tasks for cancellation based on a sorting condition. + * The specific sorting condition depends on the implementation. + */ +public abstract class AbstractTaskSelectionStrategy implements TaskSelectionStrategy { + + /** + * Returns a comparator that defines the sorting condition for tasks. + * The specific sorting condition depends on the implementation. + * + * @return The comparator + */ + public abstract Comparator sortingCondition(); + + /** + * Selects tasks for cancellation based on the provided limit and resource type. + * The tasks are sorted based on the sorting condition and then selected until the accumulated resource usage reaches the limit. 
+ * + * @param tasks The list of tasks from which to select + * @param limit The limit on the accumulated resource usage + * @param resourceType The type of resource to consider + * @return The list of selected tasks + * @throws IllegalArgumentException If the limit is less than zero + */ + @Override + public List selectTasksForCancellation(List tasks, long limit, ResourceType resourceType) { + if (limit < 0) { + throw new IllegalArgumentException("reduceBy has to be greater than zero"); + } + if (limit == 0) { + return Collections.emptyList(); + } + + List sortedTasks = tasks.stream().sorted(sortingCondition()).collect(Collectors.toList()); + + List selectedTasks = new ArrayList<>(); + long accumulated = 0; + + for (Task task : sortedTasks) { + if (task instanceof CancellableTask) { + selectedTasks.add(createTaskCancellation((CancellableTask) task)); + accumulated += resourceType.getResourceUsage(task); + if (accumulated >= limit) { + break; + } + } + } + return selectedTasks; + } + + private TaskCancellation createTaskCancellation(CancellableTask task) { + // TODO add correct reason and callbacks + return new TaskCancellation(task, List.of(new TaskCancellation.Reason("limits exceeded", 5)), List.of(this::callbackOnCancel)); + } + + private void callbackOnCancel() { + // todo Implement callback logic here mostly used for Stats + } +} diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java new file mode 100644 index 0000000000000..d932d21e4affe --- /dev/null +++ b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java @@ -0,0 +1,218 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.wlm.cancellation; + +import org.opensearch.cluster.metadata.QueryGroup; +import org.opensearch.common.settings.ClusterSettings; +import org.opensearch.common.settings.Settings; +import org.opensearch.monitor.jvm.JvmStats; +import org.opensearch.monitor.process.ProcessProbe; +import org.opensearch.search.ResourceType; +import org.opensearch.search.backpressure.settings.NodeDuressSettings; +import org.opensearch.search.backpressure.trackers.NodeDuressTrackers; +import org.opensearch.tasks.TaskCancellation; +import org.opensearch.wlm.QueryGroupLevelResourceUsageView; + +import java.util.ArrayList; +import java.util.EnumMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.TRACKED_RESOURCES; + +/** + * Manages the cancellation of tasks enforced by QueryGroup thresholds on resource usage criteria. + * This class utilizes a strategy pattern through {@link TaskSelectionStrategy} to identify tasks that exceed + * predefined resource usage limits and are therefore eligible for cancellation. + * + *

<p>The cancellation process is initiated by evaluating the resource usage of each QueryGroup against its + * resource limits. Tasks that contribute to exceeding these limits are selected for cancellation based on the + * implemented task selection strategy.</p>
+ * + * <p>Instances of this class are configured with a map linking QueryGroup IDs to their corresponding resource usage + * views, a set of active QueryGroups, and a task selection strategy. These components collectively facilitate the + * identification and cancellation of tasks that threaten to breach QueryGroup resource limits.</p>

+ * + * @see TaskSelectionStrategy + * @see QueryGroup + * @see ResourceType + */ +public class DefaultTaskCancellation { + private static final long HEAP_SIZE_BYTES = JvmStats.jvmStats().getMem().getHeapMax().getBytes(); + + protected final TaskSelectionStrategy taskSelectionStrategy; + // a map of QueryGroupId to its corresponding QueryGroupLevelResourceUsageView object + protected final Map queryGroupLevelResourceUsageViews; + protected final Set activeQueryGroups; + protected NodeDuressTrackers nodeDuressTrackers; + + public DefaultTaskCancellation( + TaskSelectionStrategy taskSelectionStrategy, + Map queryGroupLevelResourceUsageViews, + Set activeQueryGroups, + Settings settings, + ClusterSettings clusterSettings + ) { + this.taskSelectionStrategy = taskSelectionStrategy; + this.queryGroupLevelResourceUsageViews = queryGroupLevelResourceUsageViews; + this.activeQueryGroups = activeQueryGroups; + this.nodeDuressTrackers = setupNodeDuressTracker(settings, clusterSettings); + } + + /** + * Cancel tasks based on the implemented strategy. + */ + public final void cancelTasks() { + cancelTasksForMode(QueryGroup.ResiliencyMode.ENFORCED); + + if (nodeDuressTrackers.isNodeInDuress()) { + cancelTasksForMode(QueryGroup.ResiliencyMode.SOFT); + } + } + + private void cancelTasksForMode(QueryGroup.ResiliencyMode resiliencyMode) { + List cancellableTasks = getAllCancellableTasksFrom(resiliencyMode); + for (TaskCancellation taskCancellation : cancellableTasks) { + taskCancellation.cancel(); + } + } + + /** + * Get all cancellable tasks from the QueryGroups. + * + * @return List of tasks that can be cancelled + */ + protected List getAllCancellableTasksFrom(QueryGroup.ResiliencyMode resiliencyMode) { + return getQueryGroupsToCancelFrom(resiliencyMode).stream() + .flatMap(queryGroup -> getCancellableTasksFrom(queryGroup).stream()) + .collect(Collectors.toList()); + } + + /** + * returns the list of QueryGroups breaching their resource limits. 
+ * + * @return List of QueryGroups + */ + private List getQueryGroupsToCancelFrom(QueryGroup.ResiliencyMode resiliencyMode) { + final List queryGroupsToCancelFrom = new ArrayList<>(); + + for (QueryGroup queryGroup : this.activeQueryGroups) { + if (queryGroup.getResiliencyMode() != resiliencyMode) { + continue; + } + Map queryGroupResourceUsage = queryGroupLevelResourceUsageViews.get(queryGroup.get_id()) + .getResourceUsageData(); + + for (ResourceType resourceType : TRACKED_RESOURCES) { + if (queryGroup.getResourceLimits().containsKey(resourceType) && queryGroupResourceUsage.containsKey(resourceType)) { + Double resourceLimit = (Double) queryGroup.getResourceLimits().get(resourceType); + Long resourceUsage = queryGroupResourceUsage.get(resourceType); + + if (isBreachingThreshold(resourceType, resourceLimit, resourceUsage)) { + queryGroupsToCancelFrom.add(queryGroup); + break; + } + } + } + } + + return queryGroupsToCancelFrom; + } + + /** + * Get cancellable tasks from a specific queryGroup. 
+ * + * @param queryGroup The QueryGroup from which to get cancellable tasks + * @return List of tasks that can be cancelled + */ + protected List getCancellableTasksFrom(QueryGroup queryGroup) { + return TRACKED_RESOURCES.stream() + .filter(resourceType -> shouldCancelTasks(queryGroup, resourceType)) + .flatMap(resourceType -> getTaskCancellations(queryGroup, resourceType).stream()) + .collect(Collectors.toList()); + } + + private boolean shouldCancelTasks(QueryGroup queryGroup, ResourceType resourceType) { + long reduceBy = getReduceBy(queryGroup, resourceType); + return reduceBy > 0; + } + + private List getTaskCancellations(QueryGroup queryGroup, ResourceType resourceType) { + return taskSelectionStrategy.selectTasksForCancellation( + // get the active tasks in the query group + queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getActiveTasks(), + getReduceBy(queryGroup, resourceType), + resourceType + ); + } + + private long getReduceBy(QueryGroup queryGroup, ResourceType resourceType) { + if (queryGroup.getResourceLimits().get(resourceType) == null) { + return 0; + } + Double threshold = (Double) queryGroup.getResourceLimits().get(resourceType); + return getResourceUsage(queryGroup, resourceType) - convertThresholdIntoLong(resourceType, threshold); + } + + private Long convertThresholdIntoLong(ResourceType resourceType, Double resourceThresholdInPercentage) { + Long threshold = null; + if (resourceType == ResourceType.MEMORY) { + // Check if resource usage is breaching the threshold + threshold = (long) (resourceThresholdInPercentage * HEAP_SIZE_BYTES); + } else if (resourceType == ResourceType.CPU) { + // Get the total CPU time of the process in milliseconds + long cpuTotalTimeInMillis = ProcessProbe.getInstance().getProcessCpuTotalTime(); + // Check if resource usage is breaching the threshold + threshold = (long) (resourceThresholdInPercentage * cpuTotalTimeInMillis); + } + return threshold; + } + + private Long getResourceUsage(QueryGroup 
queryGroup, ResourceType resourceType) { + if (!queryGroupLevelResourceUsageViews.containsKey(queryGroup.get_id())) { + return 0L; + } + return queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getResourceUsageData().get(resourceType); + } + + private boolean isBreachingThreshold(ResourceType resourceType, Double resourceThresholdInPercentage, long resourceUsage) { + if (resourceType == ResourceType.MEMORY) { + // Check if resource usage is breaching the threshold + return resourceUsage > convertThresholdIntoLong(resourceType, resourceThresholdInPercentage); + } + // Resource types should be CPU, resourceUsage is in nanoseconds, convert to milliseconds + long resourceUsageInMillis = resourceUsage / 1_000_000; + // Check if resource usage is breaching the threshold + return resourceUsageInMillis > convertThresholdIntoLong(resourceType, resourceThresholdInPercentage); + } + + private NodeDuressTrackers setupNodeDuressTracker(Settings settings, ClusterSettings clusterSettings) { + NodeDuressSettings nodeDuressSettings = new NodeDuressSettings(settings, clusterSettings); + return new NodeDuressTrackers(new EnumMap<>(ResourceType.class) { + { + put( + ResourceType.CPU, + new NodeDuressTrackers.NodeDuressTracker( + () -> ProcessProbe.getInstance().getProcessCpuPercent() / 100.0 >= nodeDuressSettings.getCpuThreshold(), + nodeDuressSettings::getNumSuccessiveBreaches + ) + ); + put( + ResourceType.MEMORY, + new NodeDuressTrackers.NodeDuressTracker( + () -> JvmStats.jvmStats().getMem().getHeapUsedPercent() / 100.0 >= nodeDuressSettings.getHeapThreshold(), + nodeDuressSettings::getNumSuccessiveBreaches + ) + ); + } + }); + } +} diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/LongestRunningTaskFirstSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/LongestRunningTaskFirstSelectionStrategy.java new file mode 100644 index 0000000000000..d36d55b25bb4a --- /dev/null +++ 
b/server/src/main/java/org/opensearch/wlm/cancellation/LongestRunningTaskFirstSelectionStrategy.java @@ -0,0 +1,29 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.wlm.cancellation; + +import org.opensearch.tasks.Task; + +import java.util.Comparator; + +/** + * Represents a task selection strategy that prioritizes the longest running tasks first. + */ +public class LongestRunningTaskFirstSelectionStrategy extends AbstractTaskSelectionStrategy { + + /** + * Returns a comparator that sorts tasks based on their start time in ascending order, so the earliest-started (longest running) task comes first. + * + * @return The comparator + */ + @Override + public Comparator sortingCondition() { + return Comparator.comparingLong(Task::getStartTime); + } +} diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/ShortestRunningTaskFirstSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/ShortestRunningTaskFirstSelectionStrategy.java new file mode 100644 index 0000000000000..1e8e75b291d05 --- /dev/null +++ b/server/src/main/java/org/opensearch/wlm/cancellation/ShortestRunningTaskFirstSelectionStrategy.java @@ -0,0 +1,29 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.wlm.cancellation; + +import org.opensearch.tasks.Task; + +import java.util.Comparator; + +/** + * Represents a task selection strategy that prioritizes the shortest running tasks first. + */ +public class ShortestRunningTaskFirstSelectionStrategy extends AbstractTaskSelectionStrategy { + + /** + * Returns a comparator that sorts tasks based on their start time in descending order, so the most recently started (shortest running) task comes first. 
+ * + * @return The comparator + */ + @Override + public Comparator sortingCondition() { + return Comparator.comparingLong(Task::getStartTime).reversed(); + } +} diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/TaskSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/TaskSelectionStrategy.java new file mode 100644 index 0000000000000..72161671186f2 --- /dev/null +++ b/server/src/main/java/org/opensearch/wlm/cancellation/TaskSelectionStrategy.java @@ -0,0 +1,32 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.wlm.cancellation; + +import org.opensearch.search.ResourceType; +import org.opensearch.tasks.Task; +import org.opensearch.tasks.TaskCancellation; + +import java.util.List; + +/** + * Interface for strategies to select tasks for cancellation. + * Implementations of this interface define how tasks are selected for cancellation based on resource usage. + */ +public interface TaskSelectionStrategy { + /** + * Determines which tasks should be cancelled based on the provided criteria. + * + * @param tasks List of tasks available for cancellation. + * @param limit The accumulated resource usage that the selected tasks must reach; selection stops once it is met. + * @param resourceType The type of resource that needs to be reduced, guiding the selection process. + * + * @return List of tasks that should be cancelled. 
+ */ + List selectTasksForCancellation(List tasks, long limit, ResourceType resourceType); +} diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/package-info.java b/server/src/main/java/org/opensearch/wlm/cancellation/package-info.java new file mode 100644 index 0000000000000..9618d22c9d5e2 --- /dev/null +++ b/server/src/main/java/org/opensearch/wlm/cancellation/package-info.java @@ -0,0 +1,12 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/** + * QueryGroup resource cancellation artifacts + */ +package org.opensearch.wlm.cancellation; diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java new file mode 100644 index 0000000000000..0c8f186ed425b --- /dev/null +++ b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java @@ -0,0 +1,340 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.wlm.cancellation; + +import org.opensearch.action.search.SearchAction; +import org.opensearch.action.search.SearchTask; +import org.opensearch.cluster.metadata.QueryGroup; +import org.opensearch.common.settings.ClusterSettings; +import org.opensearch.common.settings.Settings; +import org.opensearch.core.tasks.TaskId; +import org.opensearch.search.ResourceType; +import org.opensearch.search.backpressure.trackers.NodeDuressTrackers; +import org.opensearch.tasks.Task; +import org.opensearch.tasks.TaskCancellation; +import org.opensearch.test.OpenSearchTestCase; +import org.opensearch.wlm.QueryGroupLevelResourceUsageView; +import org.junit.Before; + +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class DefaultTaskCancellationTests extends OpenSearchTestCase { + private static final String queryGroupId1 = "queryGroup1"; + private static final String queryGroupId2 = "queryGroup2"; + + private static class TestTaskCancellationImpl extends DefaultTaskCancellation { + + public TestTaskCancellationImpl( + TaskSelectionStrategy taskSelectionStrategy, + Map queryGroupLevelViews, + Set activeQueryGroups + ) { + super( + taskSelectionStrategy, + queryGroupLevelViews, + activeQueryGroups, + Settings.EMPTY, + new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS) + ); + } + } + + private Map queryGroupLevelViews; + private Set activeQueryGroups; + private DefaultTaskCancellation taskCancellation; + + @Before + public void setup() { + queryGroupLevelViews = new HashMap<>(); + activeQueryGroups = new HashSet<>(); + taskCancellation = new TestTaskCancellationImpl( + new TaskSelectionStrategyTests.TestTaskSelectionStrategy(), + queryGroupLevelViews, + activeQueryGroups + ); + } + + public void 
testGetCancellableTasksFrom_returnsTasksWhenBreachingThreshold() { + ResourceType resourceType = ResourceType.CPU; + long usage = 100_000_000L; + Double threshold = 0.1; + + QueryGroup queryGroup1 = new QueryGroup( + "testQueryGroup", + queryGroupId1, + QueryGroup.ResiliencyMode.ENFORCED, + Map.of(resourceType, threshold), + 1L + ); + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage); + queryGroupLevelViews.put(queryGroupId1, mockView); + + List cancellableTasksFrom = taskCancellation.getCancellableTasksFrom(queryGroup1); + assertEquals(2, cancellableTasksFrom.size()); + assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); + assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); + } + + public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThresholdForMemory() { + ResourceType resourceType = ResourceType.MEMORY; + long usage = 900_000_000_000L; + Double threshold = 0.1; + + QueryGroup queryGroup1 = new QueryGroup( + "testQueryGroup", + queryGroupId1, + QueryGroup.ResiliencyMode.ENFORCED, + Map.of(resourceType, threshold), + 1L + ); + + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage); + queryGroupLevelViews.put(queryGroupId1, mockView); + activeQueryGroups.add(queryGroup1); + + List cancellableTasksFrom = taskCancellation.getAllCancellableTasksFrom(QueryGroup.ResiliencyMode.ENFORCED); + assertEquals(2, cancellableTasksFrom.size()); + assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); + assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); + } + + public void testGetCancellableTasksFrom_returnsNoTasksWhenNotBreachingThreshold() { + ResourceType resourceType = ResourceType.CPU; + long usage = 500L; + Double threshold = 0.9; + QueryGroup queryGroup1 = new QueryGroup( + "testQueryGroup", + queryGroupId1, + QueryGroup.ResiliencyMode.ENFORCED, + Map.of(resourceType, threshold), + 1L + ); + + 
QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage); + queryGroupLevelViews.put(queryGroupId1, mockView); + activeQueryGroups.add(queryGroup1); + + List cancellableTasksFrom = taskCancellation.getCancellableTasksFrom(queryGroup1); + assertTrue(cancellableTasksFrom.isEmpty()); + } + + public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { + ResourceType resourceType = ResourceType.CPU; + long usage = 150_000_000L; + Double threshold = 0.01; + + QueryGroup queryGroup1 = new QueryGroup( + "testQueryGroup", + queryGroupId1, + QueryGroup.ResiliencyMode.ENFORCED, + Map.of(resourceType, threshold), + 1L + ); + + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage); + queryGroupLevelViews.put(queryGroupId1, mockView); + activeQueryGroups.add(queryGroup1); + + TestTaskCancellationImpl taskCancellation = new TestTaskCancellationImpl( + new TaskSelectionStrategyTests.TestTaskSelectionStrategy(), + queryGroupLevelViews, + activeQueryGroups + ); + + List cancellableTasksFrom = taskCancellation.getAllCancellableTasksFrom(QueryGroup.ResiliencyMode.SOFT); + assertEquals(0, cancellableTasksFrom.size()); + } + + public void testCancelTasks_cancelsGivenTasks() { + ResourceType resourceType = ResourceType.CPU; + long usage = 150_000_000L; + Double threshold = 0.01; + + QueryGroup queryGroup1 = new QueryGroup( + "testQueryGroup", + queryGroupId1, + QueryGroup.ResiliencyMode.ENFORCED, + Map.of(resourceType, threshold), + 1L + ); + + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage); + queryGroupLevelViews.put(queryGroupId1, mockView); + activeQueryGroups.add(queryGroup1); + + TestTaskCancellationImpl taskCancellation = new TestTaskCancellationImpl( + new TaskSelectionStrategyTests.TestTaskSelectionStrategy(), + queryGroupLevelViews, + activeQueryGroups + ); + + List cancellableTasksFrom = 
taskCancellation.getAllCancellableTasksFrom(QueryGroup.ResiliencyMode.ENFORCED); + assertEquals(2, cancellableTasksFrom.size()); + assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); + assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); + + taskCancellation.cancelTasks(); + assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); + assertTrue(cancellableTasksFrom.get(1).getTask().isCancelled()); + } + + public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { + ResourceType resourceType = ResourceType.CPU; + long usage = 150_000_000L; + Double threshold = 0.01; + + QueryGroup queryGroup1 = new QueryGroup( + "testQueryGroup", + queryGroupId1, + QueryGroup.ResiliencyMode.ENFORCED, + Map.of(resourceType, threshold), + 1L + ); + + QueryGroup queryGroup2 = new QueryGroup( + "testQueryGroup", + queryGroupId2, + QueryGroup.ResiliencyMode.SOFT, + Map.of(resourceType, threshold), + 1L + ); + + queryGroupLevelViews.put(queryGroupId1, createResourceUsageViewMock(resourceType, usage)); + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage); + when(mockView.getActiveTasks()).thenReturn(List.of(getRandomSearchTask(5678), getRandomSearchTask(8765))); + queryGroupLevelViews.put(queryGroupId2, mockView); + Collections.addAll(activeQueryGroups, queryGroup1, queryGroup2); + + TestTaskCancellationImpl taskCancellation = new TestTaskCancellationImpl( + new TaskSelectionStrategyTests.TestTaskSelectionStrategy(), + queryGroupLevelViews, + activeQueryGroups + ); + + NodeDuressTrackers mock = mock(NodeDuressTrackers.class); + when(mock.isNodeInDuress()).thenReturn(true); + taskCancellation.nodeDuressTrackers = mock; + + List cancellableTasksFrom = taskCancellation.getAllCancellableTasksFrom(QueryGroup.ResiliencyMode.ENFORCED); + assertEquals(2, cancellableTasksFrom.size()); + assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); + assertEquals(4321, 
cancellableTasksFrom.get(1).getTask().getId()); + + List cancellableTasksFrom1 = taskCancellation.getAllCancellableTasksFrom(QueryGroup.ResiliencyMode.SOFT); + assertEquals(2, cancellableTasksFrom1.size()); + assertEquals(5678, cancellableTasksFrom1.get(0).getTask().getId()); + assertEquals(8765, cancellableTasksFrom1.get(1).getTask().getId()); + + taskCancellation.cancelTasks(); + assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); + assertTrue(cancellableTasksFrom.get(1).getTask().isCancelled()); + assertTrue(cancellableTasksFrom1.get(0).getTask().isCancelled()); + assertTrue(cancellableTasksFrom1.get(1).getTask().isCancelled()); + } + + public void testGetAllCancellableTasks_ReturnsNoTasksFromWhenNotBreachingThresholds() { + ResourceType resourceType = ResourceType.CPU; + long usage = 1L; + Double threshold = 0.1; + + QueryGroup queryGroup1 = new QueryGroup( + "testQueryGroup", + queryGroupId1, + QueryGroup.ResiliencyMode.ENFORCED, + Map.of(resourceType, threshold), + 1L + ); + + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage); + queryGroupLevelViews.put(queryGroupId1, mockView); + activeQueryGroups.add(queryGroup1); + + List allCancellableTasks = taskCancellation.getAllCancellableTasksFrom(QueryGroup.ResiliencyMode.ENFORCED); + assertTrue(allCancellableTasks.isEmpty()); + } + + public void testGetAllCancellableTasks_ReturnsTasksFromWhenBreachingThresholds() { + ResourceType resourceType = ResourceType.CPU; + long usage = 150_000_000L; + Double threshold = 0.01; + + QueryGroup queryGroup1 = new QueryGroup( + "testQueryGroup", + queryGroupId1, + QueryGroup.ResiliencyMode.ENFORCED, + Map.of(resourceType, threshold), + 1L + ); + + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage); + queryGroupLevelViews.put(queryGroupId1, mockView); + activeQueryGroups.add(queryGroup1); + + List allCancellableTasks = 
taskCancellation.getAllCancellableTasksFrom(QueryGroup.ResiliencyMode.ENFORCED); + assertEquals(2, allCancellableTasks.size()); + assertEquals(1234, allCancellableTasks.get(0).getTask().getId()); + assertEquals(4321, allCancellableTasks.get(1).getTask().getId()); + } + + public void testGetCancellableTasksFrom_doesNotReturnTasksWhenQueryGroupIdNotFound() { + ResourceType resourceType = ResourceType.CPU; + long usage = 150_000_000_000L; + Double threshold = 0.01; + + QueryGroup queryGroup1 = new QueryGroup( + "testQueryGroup", + queryGroupId1, + QueryGroup.ResiliencyMode.ENFORCED, + Map.of(resourceType, threshold), + 1L + ); + QueryGroup queryGroup2 = new QueryGroup( + "testQueryGroup", + queryGroupId2, + QueryGroup.ResiliencyMode.ENFORCED, + Map.of(resourceType, threshold), + 1L + ); + + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage); + + queryGroupLevelViews.put(queryGroupId1, mockView); + activeQueryGroups.add(queryGroup1); + activeQueryGroups.add(queryGroup2); + + List cancellableTasksFrom = taskCancellation.getCancellableTasksFrom(queryGroup2); + assertEquals(0, cancellableTasksFrom.size()); + } + + private QueryGroupLevelResourceUsageView createResourceUsageViewMock(ResourceType resourceType, Long usage) { + QueryGroupLevelResourceUsageView mockView = mock(QueryGroupLevelResourceUsageView.class); + when(mockView.getResourceUsageData()).thenReturn(Collections.singletonMap(resourceType, usage)); + when(mockView.getActiveTasks()).thenReturn(List.of(getRandomSearchTask(1234), getRandomSearchTask(4321))); + return mockView; + } + + private Task getRandomSearchTask(long id) { + return new SearchTask( + id, + "transport", + SearchAction.NAME, + () -> "test description", + new TaskId(randomLong() + ":" + randomLong()), + Collections.emptyMap() + ); + } +} diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/LongestRunningTaskFirstStrategySelectionStrategyTests.java 
b/server/src/test/java/org/opensearch/wlm/cancellation/LongestRunningTaskFirstStrategySelectionStrategyTests.java new file mode 100644 index 0000000000000..ad76a5021b175 --- /dev/null +++ b/server/src/test/java/org/opensearch/wlm/cancellation/LongestRunningTaskFirstStrategySelectionStrategyTests.java @@ -0,0 +1,34 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.wlm.cancellation; + +import org.opensearch.tasks.Task; +import org.opensearch.test.OpenSearchTestCase; + +import java.util.Arrays; +import java.util.List; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class LongestRunningTaskFirstStrategySelectionStrategyTests extends OpenSearchTestCase { + public void testSortingCondition() { + Task task1 = mock(Task.class); + Task task2 = mock(Task.class); + Task task3 = mock(Task.class); + when(task1.getStartTime()).thenReturn(100L); + when(task2.getStartTime()).thenReturn(200L); + when(task3.getStartTime()).thenReturn(300L); + + List tasks = Arrays.asList(task2, task1, task3); + tasks.sort(new LongestRunningTaskFirstSelectionStrategy().sortingCondition()); + + assertEquals(Arrays.asList(task1, task2, task3), tasks); + } +} diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/ShortestRunningTaskFirstStrategySelectionStrategyTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/ShortestRunningTaskFirstStrategySelectionStrategyTests.java new file mode 100644 index 0000000000000..3c07df09f6f5e --- /dev/null +++ b/server/src/test/java/org/opensearch/wlm/cancellation/ShortestRunningTaskFirstStrategySelectionStrategyTests.java @@ -0,0 +1,34 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * 
compatible open source license. + */ + +package org.opensearch.wlm.cancellation; + +import org.opensearch.tasks.Task; +import org.opensearch.test.OpenSearchTestCase; + +import java.util.Arrays; +import java.util.List; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class ShortestRunningTaskFirstStrategySelectionStrategyTests extends OpenSearchTestCase { + public void testSortingCondition() { + Task task1 = mock(Task.class); + Task task2 = mock(Task.class); + Task task3 = mock(Task.class); + when(task1.getStartTime()).thenReturn(100L); + when(task2.getStartTime()).thenReturn(200L); + when(task3.getStartTime()).thenReturn(300L); + + List tasks = Arrays.asList(task1, task3, task2); + tasks.sort(new ShortestRunningTaskFirstSelectionStrategy().sortingCondition()); + + assertEquals(Arrays.asList(task3, task2, task1), tasks); + } +} diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/TaskSelectionStrategyTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/TaskSelectionStrategyTests.java new file mode 100644 index 0000000000000..43ccbd0920068 --- /dev/null +++ b/server/src/test/java/org/opensearch/wlm/cancellation/TaskSelectionStrategyTests.java @@ -0,0 +1,121 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.wlm.cancellation; + +import org.opensearch.action.search.SearchAction; +import org.opensearch.action.search.SearchTask; +import org.opensearch.core.tasks.TaskId; +import org.opensearch.core.tasks.resourcetracker.ResourceStats; +import org.opensearch.core.tasks.resourcetracker.ResourceStatsType; +import org.opensearch.core.tasks.resourcetracker.ResourceUsageMetric; +import org.opensearch.search.ResourceType; +import org.opensearch.tasks.Task; +import org.opensearch.tasks.TaskCancellation; +import org.opensearch.test.OpenSearchTestCase; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; + +public class TaskSelectionStrategyTests extends OpenSearchTestCase { + + public static class TestTaskSelectionStrategy extends AbstractTaskSelectionStrategy { + @Override + public Comparator sortingCondition() { + return Comparator.comparingLong(Task::getId); + } + } + + public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsGreaterThanZero() { + TaskSelectionStrategy testTaskSelectionStrategy = new TestTaskSelectionStrategy(); + long threshold = 100L; + long reduceBy = 50L; + ResourceType resourceType = ResourceType.MEMORY; + List tasks = getListOfTasks(threshold); + + List selectedTasks = testTaskSelectionStrategy.selectTasksForCancellation(tasks, reduceBy, resourceType); + assertFalse(selectedTasks.isEmpty()); + assertTrue(tasksUsageMeetsThreshold(selectedTasks, reduceBy)); + } + + public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsLesserThanZero() { + TaskSelectionStrategy testTaskSelectionStrategy = new TestTaskSelectionStrategy(); + long threshold = 100L; + long reduceBy = -50L; + ResourceType resourceType = ResourceType.MEMORY; + List tasks = getListOfTasks(threshold); + + try { + testTaskSelectionStrategy.selectTasksForCancellation(tasks, reduceBy, resourceType); + } catch (Exception e) { + assertTrue(e instanceof 
IllegalArgumentException); + assertEquals("reduceBy has to be greater than zero", e.getMessage()); + } + } + + public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsEqualToZero() { + TaskSelectionStrategy testTaskSelectionStrategy = new TestTaskSelectionStrategy(); + long threshold = 100L; + long reduceBy = 0; + ResourceType resourceType = ResourceType.MEMORY; + List tasks = getListOfTasks(threshold); + + List selectedTasks = testTaskSelectionStrategy.selectTasksForCancellation(tasks, reduceBy, resourceType); + assertTrue(selectedTasks.isEmpty()); + } + + private boolean tasksUsageMeetsThreshold(List selectedTasks, long threshold) { + long memory = 0; + for (TaskCancellation task : selectedTasks) { + memory += task.getTask().getTotalResourceUtilization(ResourceStats.MEMORY); + if (memory > threshold) { + return true; + } + } + return false; + } + + private List getListOfTasks(long totalMemory) { + List tasks = new ArrayList<>(); + + while (totalMemory > 0) { + long id = randomLong(); + final Task task = getRandomSearchTask(id); + long initial_memory = randomLongBetween(1, 100); + + ResourceUsageMetric[] initialTaskResourceMetrics = new ResourceUsageMetric[] { + new ResourceUsageMetric(ResourceStats.MEMORY, initial_memory) }; + task.startThreadResourceTracking(id, ResourceStatsType.WORKER_STATS, initialTaskResourceMetrics); + + long memory = initial_memory + randomLongBetween(1, 10000); + + totalMemory -= memory - initial_memory; + + ResourceUsageMetric[] taskResourceMetrics = new ResourceUsageMetric[] { + new ResourceUsageMetric(ResourceStats.MEMORY, memory), }; + task.updateThreadResourceStats(id, ResourceStatsType.WORKER_STATS, taskResourceMetrics); + task.stopThreadResourceTracking(id, ResourceStatsType.WORKER_STATS); + tasks.add(task); + } + + return tasks; + } + + private Task getRandomSearchTask(long id) { + return new SearchTask( + id, + "transport", + SearchAction.NAME, + () -> "test description", + new TaskId(randomLong() + ":" + 
randomLong()), + Collections.emptyMap() + ); + } +} From 83e20c0228d016d92ba46c39fac2d53c54f8dcbb Mon Sep 17 00:00:00 2001 From: Kiran Prakash Date: Wed, 7 Aug 2024 10:32:04 -0700 Subject: [PATCH 02/47] Update CHANGELOG.md Signed-off-by: Kiran Prakash --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3dff44ed96dfd..c2811de099632 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Add took time to request nodes stats ([#15054](https://github.com/opensearch-project/OpenSearch/pull/15054)) - [Workload Management] Add Get QueryGroup API Logic ([14709](https://github.com/opensearch-project/OpenSearch/pull/14709)) - [Workload Management] Add Settings for Workload Management feature ([#15028](https://github.com/opensearch-project/OpenSearch/pull/15028)) +- [Workload Management] QueryGroup resource cancellation framework changes ([#15151](https://github.com/opensearch-project/OpenSearch/pull/15151)) - [Workload Management] QueryGroup resource tracking framework changes ([#13897](https://github.com/opensearch-project/OpenSearch/pull/13897)) - Support filtering on a large list encoded by bitmap ([#14774](https://github.com/opensearch-project/OpenSearch/pull/14774)) - Add slice execution listeners to SearchOperationListener interface ([#15153](https://github.com/opensearch-project/OpenSearch/pull/15153)) From 9983c734b356ef2eb75a34b3dcd4f598165e7472 Mon Sep 17 00:00:00 2001 From: Kiran Prakash Date: Wed, 7 Aug 2024 11:41:41 -0700 Subject: [PATCH 03/47] add better cancellation reason Signed-off-by: Kiran Prakash --- .../AbstractTaskSelectionStrategy.java | 34 +++++++--- .../cancellation/DefaultTaskCancellation.java | 1 + .../cancellation/TaskSelectionStrategy.java | 3 +- .../TaskSelectionStrategyTests.java | 62 ++++++++++++++++--- 4 files changed, 82 insertions(+), 18 deletions(-) diff --git 
a/server/src/main/java/org/opensearch/wlm/cancellation/AbstractTaskSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/AbstractTaskSelectionStrategy.java index 4f592392a3d63..a2fdcc011885d 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/AbstractTaskSelectionStrategy.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/AbstractTaskSelectionStrategy.java @@ -8,6 +8,7 @@ package org.opensearch.wlm.cancellation; +import org.opensearch.cluster.metadata.QueryGroup; import org.opensearch.search.ResourceType; import org.opensearch.tasks.CancellableTask; import org.opensearch.tasks.Task; @@ -45,9 +46,14 @@ public abstract class AbstractTaskSelectionStrategy implements TaskSelectionStra * @throws IllegalArgumentException If the limit is less than zero */ @Override - public List selectTasksForCancellation(List tasks, long limit, ResourceType resourceType) { + public List selectTasksForCancellation( + QueryGroup querygroup, + List tasks, + long limit, + ResourceType resourceType + ) { if (limit < 0) { - throw new IllegalArgumentException("reduceBy has to be greater than zero"); + throw new IllegalArgumentException("limit has to be greater than zero"); } if (limit == 0) { return Collections.emptyList(); @@ -60,7 +66,8 @@ public List selectTasksForCancellation(List tasks, long for (Task task : sortedTasks) { if (task instanceof CancellableTask) { - selectedTasks.add(createTaskCancellation((CancellableTask) task)); + String cancellationReason = createCancellationReason(querygroup, resourceType); + selectedTasks.add(createTaskCancellation((CancellableTask) task, cancellationReason)); accumulated += resourceType.getResourceUsage(task); if (accumulated >= limit) { break; @@ -70,12 +77,25 @@ public List selectTasksForCancellation(List tasks, long return selectedTasks; } - private TaskCancellation createTaskCancellation(CancellableTask task) { - // TODO add correct reason and callbacks - return new TaskCancellation(task, 
List.of(new TaskCancellation.Reason("limits exceeded", 5)), List.of(this::callbackOnCancel)); + private String createCancellationReason(QueryGroup querygroup, ResourceType resourceType) { + Double thresholdInPercent = getThresholdInPercent(querygroup, resourceType); + return "[Workload Management] QueryGroup ID : " + + querygroup.get_id() + + " breached the resource limit of : " + + thresholdInPercent + + " for resource type : " + + resourceType.getName(); + } + + private Double getThresholdInPercent(QueryGroup querygroup, ResourceType resourceType) { + return ((Double) (querygroup.getResourceLimits().get(resourceType))) * 100; + } + + private TaskCancellation createTaskCancellation(CancellableTask task, String cancellationReason) { + return new TaskCancellation(task, List.of(new TaskCancellation.Reason(cancellationReason, 5)), List.of(this::callbackOnCancel)); } private void callbackOnCancel() { - // todo Implement callback logic here mostly used for Stats + // TODO Implement callback logic here mostly used for Stats } } diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java index d932d21e4affe..f45e234177c28 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java @@ -147,6 +147,7 @@ private boolean shouldCancelTasks(QueryGroup queryGroup, ResourceType resourceTy private List getTaskCancellations(QueryGroup queryGroup, ResourceType resourceType) { return taskSelectionStrategy.selectTasksForCancellation( + queryGroup, // get the active tasks in the query group queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getActiveTasks(), getReduceBy(queryGroup, resourceType), diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/TaskSelectionStrategy.java 
b/server/src/main/java/org/opensearch/wlm/cancellation/TaskSelectionStrategy.java index 72161671186f2..1cb0fcf142ebd 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/TaskSelectionStrategy.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/TaskSelectionStrategy.java @@ -8,6 +8,7 @@ package org.opensearch.wlm.cancellation; +import org.opensearch.cluster.metadata.QueryGroup; import org.opensearch.search.ResourceType; import org.opensearch.tasks.Task; import org.opensearch.tasks.TaskCancellation; @@ -28,5 +29,5 @@ public interface TaskSelectionStrategy { * * @return List of tasks that should be cancelled. */ - List selectTasksForCancellation(List tasks, long limit, ResourceType resourceType); + List selectTasksForCancellation(QueryGroup queryGroup, List tasks, long limit, ResourceType resourceType); } diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/TaskSelectionStrategyTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/TaskSelectionStrategyTests.java index 43ccbd0920068..f7349cbc628bd 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/TaskSelectionStrategyTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/TaskSelectionStrategyTests.java @@ -10,6 +10,7 @@ import org.opensearch.action.search.SearchAction; import org.opensearch.action.search.SearchTask; +import org.opensearch.cluster.metadata.QueryGroup; import org.opensearch.core.tasks.TaskId; import org.opensearch.core.tasks.resourcetracker.ResourceStats; import org.opensearch.core.tasks.resourcetracker.ResourceStatsType; @@ -23,6 +24,7 @@ import java.util.Collections; import java.util.Comparator; import java.util.List; +import java.util.Map; public class TaskSelectionStrategyTests extends OpenSearchTestCase { @@ -35,39 +37,79 @@ public Comparator sortingCondition() { public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsGreaterThanZero() { TaskSelectionStrategy testTaskSelectionStrategy = new 
TestTaskSelectionStrategy(); - long threshold = 100L; + long thresholdInLong = 100L; + Double threshold = 0.1; long reduceBy = 50L; ResourceType resourceType = ResourceType.MEMORY; - List tasks = getListOfTasks(threshold); + List tasks = getListOfTasks(thresholdInLong); + + QueryGroup queryGroup = new QueryGroup( + "testQueryGroup", + "queryGroupId1", + QueryGroup.ResiliencyMode.ENFORCED, + Map.of(resourceType, threshold), + 1L + ); - List selectedTasks = testTaskSelectionStrategy.selectTasksForCancellation(tasks, reduceBy, resourceType); + List selectedTasks = testTaskSelectionStrategy.selectTasksForCancellation( + queryGroup, + tasks, + reduceBy, + resourceType + ); assertFalse(selectedTasks.isEmpty()); + assertEquals( + "[Workload Management] QueryGroup ID : queryGroupId1 breached the resource limit of : 10.0 for resource type : memory", + selectedTasks.get(0).getReasonString() + ); + assertEquals(5, selectedTasks.get(0).getReasons().get(0).getCancellationScore()); assertTrue(tasksUsageMeetsThreshold(selectedTasks, reduceBy)); } public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsLesserThanZero() { TaskSelectionStrategy testTaskSelectionStrategy = new TestTaskSelectionStrategy(); - long threshold = 100L; + long thresholdInLong = 100L; + Double threshold = 0.1; long reduceBy = -50L; ResourceType resourceType = ResourceType.MEMORY; - List tasks = getListOfTasks(threshold); + List tasks = getListOfTasks(thresholdInLong); + QueryGroup queryGroup = new QueryGroup( + "testQueryGroup", + "queryGroupId1", + QueryGroup.ResiliencyMode.ENFORCED, + Map.of(resourceType, threshold), + 1L + ); try { - testTaskSelectionStrategy.selectTasksForCancellation(tasks, reduceBy, resourceType); + testTaskSelectionStrategy.selectTasksForCancellation(queryGroup, tasks, reduceBy, resourceType); } catch (Exception e) { assertTrue(e instanceof IllegalArgumentException); - assertEquals("reduceBy has to be greater than zero", e.getMessage()); + assertEquals("limit has 
to be greater than zero", e.getMessage()); } } public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsEqualToZero() { TaskSelectionStrategy testTaskSelectionStrategy = new TestTaskSelectionStrategy(); - long threshold = 100L; + long thresholdInLong = 100L; + Double threshold = 0.1; long reduceBy = 0; ResourceType resourceType = ResourceType.MEMORY; - List tasks = getListOfTasks(threshold); + List tasks = getListOfTasks(thresholdInLong); + QueryGroup queryGroup = new QueryGroup( + "testQueryGroup", + "queryGroupId1", + QueryGroup.ResiliencyMode.ENFORCED, + Map.of(resourceType, threshold), + 1L + ); - List selectedTasks = testTaskSelectionStrategy.selectTasksForCancellation(tasks, reduceBy, resourceType); + List selectedTasks = testTaskSelectionStrategy.selectTasksForCancellation( + queryGroup, + tasks, + reduceBy, + resourceType + ); assertTrue(selectedTasks.isEmpty()); } From 245ee5d5e8b1485c7917c5a826eef7512578d5a5 Mon Sep 17 00:00:00 2001 From: Kiran Prakash Date: Wed, 7 Aug 2024 13:48:08 -0700 Subject: [PATCH 04/47] Update DefaultTaskCancellationTests.java Signed-off-by: Kiran Prakash --- .../wlm/cancellation/DefaultTaskCancellationTests.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java index 0c8f186ed425b..46a2e4eed56b6 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java @@ -161,7 +161,7 @@ public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { public void testCancelTasks_cancelsGivenTasks() { ResourceType resourceType = ResourceType.CPU; - long usage = 150_000_000L; + long usage = 150_000_000_000L; Double threshold = 0.01; QueryGroup queryGroup1 = new QueryGroup( @@ -194,7 +194,7 @@ 
public void testCancelTasks_cancelsGivenTasks() { public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { ResourceType resourceType = ResourceType.CPU; - long usage = 150_000_000L; + long usage = 150_000_000_000L; Double threshold = 0.01; QueryGroup queryGroup1 = new QueryGroup( @@ -269,7 +269,7 @@ public void testGetAllCancellableTasks_ReturnsNoTasksFromWhenNotBreachingThresho public void testGetAllCancellableTasks_ReturnsTasksFromWhenBreachingThresholds() { ResourceType resourceType = ResourceType.CPU; - long usage = 150_000_000L; + long usage = 150_000_000_000L; Double threshold = 0.01; QueryGroup queryGroup1 = new QueryGroup( From 0771fd276fc561c4346afc9a20c7186e500272de Mon Sep 17 00:00:00 2001 From: Kiran Prakash Date: Wed, 21 Aug 2024 11:28:11 -0700 Subject: [PATCH 05/47] refactor Signed-off-by: Kiran Prakash --- .../cancellation/DefaultTaskCancellation.java | 49 +++++-------------- ...java => DefaultTaskSelectionStrategy.java} | 11 +++-- ...gestRunningTaskFirstSelectionStrategy.java | 29 ----------- ...testRunningTaskFirstSelectionStrategy.java | 29 ----------- .../cancellation/TaskSelectionStrategy.java | 33 ------------- .../wlm/cancellation/package-info.java | 2 +- .../DefaultTaskCancellationTests.java | 41 +++++++--------- ...=> DefaultTaskSelectionStrategyTests.java} | 22 +++------ ...skFirstStrategySelectionStrategyTests.java | 34 ------------- ...skFirstStrategySelectionStrategyTests.java | 34 ------------- 10 files changed, 42 insertions(+), 242 deletions(-) rename server/src/main/java/org/opensearch/wlm/cancellation/{AbstractTaskSelectionStrategy.java => DefaultTaskSelectionStrategy.java} (88%) delete mode 100644 server/src/main/java/org/opensearch/wlm/cancellation/LongestRunningTaskFirstSelectionStrategy.java delete mode 100644 server/src/main/java/org/opensearch/wlm/cancellation/ShortestRunningTaskFirstSelectionStrategy.java delete mode 100644 server/src/main/java/org/opensearch/wlm/cancellation/TaskSelectionStrategy.java rename 
server/src/test/java/org/opensearch/wlm/cancellation/{TaskSelectionStrategyTests.java => DefaultTaskSelectionStrategyTests.java} (85%) delete mode 100644 server/src/test/java/org/opensearch/wlm/cancellation/LongestRunningTaskFirstStrategySelectionStrategyTests.java delete mode 100644 server/src/test/java/org/opensearch/wlm/cancellation/ShortestRunningTaskFirstStrategySelectionStrategyTests.java diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java index f45e234177c28..d740d235ece73 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java @@ -9,28 +9,24 @@ package org.opensearch.wlm.cancellation; import org.opensearch.cluster.metadata.QueryGroup; -import org.opensearch.common.settings.ClusterSettings; -import org.opensearch.common.settings.Settings; import org.opensearch.monitor.jvm.JvmStats; import org.opensearch.monitor.process.ProcessProbe; import org.opensearch.search.ResourceType; -import org.opensearch.search.backpressure.settings.NodeDuressSettings; -import org.opensearch.search.backpressure.trackers.NodeDuressTrackers; import org.opensearch.tasks.TaskCancellation; import org.opensearch.wlm.QueryGroupLevelResourceUsageView; import java.util.ArrayList; -import java.util.EnumMap; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.function.BooleanSupplier; import java.util.stream.Collectors; import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.TRACKED_RESOURCES; /** * Manages the cancellation of tasks enforced by QueryGroup thresholds on resource usage criteria. 
- * This class utilizes a strategy pattern through {@link TaskSelectionStrategy} to identify tasks that exceed + * This class utilizes a strategy pattern through {@link DefaultTaskSelectionStrategy} to identify tasks that exceed * predefined resource usage limits and are therefore eligible for cancellation. * *

<p>The cancellation process is initiated by evaluating the resource usage of each QueryGroup against its @@ -41,30 +37,29 @@ * views, a set of active QueryGroups, and a task selection strategy. These components collectively facilitate the * identification and cancellation of tasks that threaten to breach QueryGroup resource limits.</p>

* - * @see TaskSelectionStrategy + * @see DefaultTaskSelectionStrategy * @see QueryGroup * @see ResourceType */ public class DefaultTaskCancellation { private static final long HEAP_SIZE_BYTES = JvmStats.jvmStats().getMem().getHeapMax().getBytes(); - protected final TaskSelectionStrategy taskSelectionStrategy; + protected final DefaultTaskSelectionStrategy defaultTaskSelectionStrategy; // a map of QueryGroupId to its corresponding QueryGroupLevelResourceUsageView object protected final Map queryGroupLevelResourceUsageViews; protected final Set activeQueryGroups; - protected NodeDuressTrackers nodeDuressTrackers; + protected BooleanSupplier isNodeInDuress; public DefaultTaskCancellation( - TaskSelectionStrategy taskSelectionStrategy, + DefaultTaskSelectionStrategy defaultTaskSelectionStrategy, Map queryGroupLevelResourceUsageViews, Set activeQueryGroups, - Settings settings, - ClusterSettings clusterSettings + BooleanSupplier isNodeInDuress ) { - this.taskSelectionStrategy = taskSelectionStrategy; + this.defaultTaskSelectionStrategy = defaultTaskSelectionStrategy; this.queryGroupLevelResourceUsageViews = queryGroupLevelResourceUsageViews; this.activeQueryGroups = activeQueryGroups; - this.nodeDuressTrackers = setupNodeDuressTracker(settings, clusterSettings); + this.isNodeInDuress = isNodeInDuress; } /** @@ -73,7 +68,7 @@ public DefaultTaskCancellation( public final void cancelTasks() { cancelTasksForMode(QueryGroup.ResiliencyMode.ENFORCED); - if (nodeDuressTrackers.isNodeInDuress()) { + if (isNodeInDuress.getAsBoolean()) { cancelTasksForMode(QueryGroup.ResiliencyMode.SOFT); } } @@ -146,7 +141,7 @@ private boolean shouldCancelTasks(QueryGroup queryGroup, ResourceType resourceTy } private List getTaskCancellations(QueryGroup queryGroup, ResourceType resourceType) { - return taskSelectionStrategy.selectTasksForCancellation( + return defaultTaskSelectionStrategy.selectTasksForCancellation( queryGroup, // get the active tasks in the query group 
queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getActiveTasks(), @@ -194,26 +189,4 @@ private boolean isBreachingThreshold(ResourceType resourceType, Double resourceT // Check if resource usage is breaching the threshold return resourceUsageInMillis > convertThresholdIntoLong(resourceType, resourceThresholdInPercentage); } - - private NodeDuressTrackers setupNodeDuressTracker(Settings settings, ClusterSettings clusterSettings) { - NodeDuressSettings nodeDuressSettings = new NodeDuressSettings(settings, clusterSettings); - return new NodeDuressTrackers(new EnumMap<>(ResourceType.class) { - { - put( - ResourceType.CPU, - new NodeDuressTrackers.NodeDuressTracker( - () -> ProcessProbe.getInstance().getProcessCpuPercent() / 100.0 >= nodeDuressSettings.getCpuThreshold(), - nodeDuressSettings::getNumSuccessiveBreaches - ) - ); - put( - ResourceType.MEMORY, - new NodeDuressTrackers.NodeDuressTracker( - () -> JvmStats.jvmStats().getMem().getHeapUsedPercent() / 100.0 >= nodeDuressSettings.getHeapThreshold(), - nodeDuressSettings::getNumSuccessiveBreaches - ) - ); - } - }); - } } diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/AbstractTaskSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java similarity index 88% rename from server/src/main/java/org/opensearch/wlm/cancellation/AbstractTaskSelectionStrategy.java rename to server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java index a2fdcc011885d..05040080a387c 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/AbstractTaskSelectionStrategy.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java @@ -22,18 +22,20 @@ /** * Represents an abstract task selection strategy. - * This class implements the TaskSelectionStrategy interface and provides a method to select tasks for cancellation based on a sorting condition. 
+ * This class implements the DefaultTaskSelectionStrategy interface and provides a method to select tasks for cancellation based on a sorting condition. * The specific sorting condition depends on the implementation. */ -public abstract class AbstractTaskSelectionStrategy implements TaskSelectionStrategy { +public class DefaultTaskSelectionStrategy { /** * Returns a comparator that defines the sorting condition for tasks. - * The specific sorting condition depends on the implementation. + * This is the default implementation since the longest running tasks are the ones that consume the most resources. * * @return The comparator */ - public abstract Comparator sortingCondition(); + public Comparator sortingCondition() { + return Comparator.comparingLong(Task::getStartTime); + } /** * Selects tasks for cancellation based on the provided limit and resource type. @@ -45,7 +47,6 @@ public abstract class AbstractTaskSelectionStrategy implements TaskSelectionStra * @return The list of selected tasks * @throws IllegalArgumentException If the limit is less than zero */ - @Override public List selectTasksForCancellation( QueryGroup querygroup, List tasks, diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/LongestRunningTaskFirstSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/LongestRunningTaskFirstSelectionStrategy.java deleted file mode 100644 index d36d55b25bb4a..0000000000000 --- a/server/src/main/java/org/opensearch/wlm/cancellation/LongestRunningTaskFirstSelectionStrategy.java +++ /dev/null @@ -1,29 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -package org.opensearch.wlm.cancellation; - -import org.opensearch.tasks.Task; - -import java.util.Comparator; - -/** - * Represents a task selection strategy that prioritizes the longest running tasks first. 
- */ -public class LongestRunningTaskFirstSelectionStrategy extends AbstractTaskSelectionStrategy { - - /** - * Returns a comparator that sorts tasks based on their start time in descending order. - * - * @return The comparator - */ - @Override - public Comparator sortingCondition() { - return Comparator.comparingLong(Task::getStartTime); - } -} diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/ShortestRunningTaskFirstSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/ShortestRunningTaskFirstSelectionStrategy.java deleted file mode 100644 index 1e8e75b291d05..0000000000000 --- a/server/src/main/java/org/opensearch/wlm/cancellation/ShortestRunningTaskFirstSelectionStrategy.java +++ /dev/null @@ -1,29 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -package org.opensearch.wlm.cancellation; - -import org.opensearch.tasks.Task; - -import java.util.Comparator; - -/** - * Represents a task selection strategy that prioritizes the shortest running tasks first. - */ -public class ShortestRunningTaskFirstSelectionStrategy extends AbstractTaskSelectionStrategy { - - /** - * Returns a comparator that sorts tasks based on their start time in ascending order. 
- * - * @return The comparator - */ - @Override - public Comparator sortingCondition() { - return Comparator.comparingLong(Task::getStartTime).reversed(); - } -} diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/TaskSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/TaskSelectionStrategy.java deleted file mode 100644 index 1cb0fcf142ebd..0000000000000 --- a/server/src/main/java/org/opensearch/wlm/cancellation/TaskSelectionStrategy.java +++ /dev/null @@ -1,33 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -package org.opensearch.wlm.cancellation; - -import org.opensearch.cluster.metadata.QueryGroup; -import org.opensearch.search.ResourceType; -import org.opensearch.tasks.Task; -import org.opensearch.tasks.TaskCancellation; - -import java.util.List; - -/** - * Interface for strategies to select tasks for cancellation. - * Implementations of this interface define how tasks are selected for cancellation based on resource usage. - */ -public interface TaskSelectionStrategy { - /** - * Determines which tasks should be cancelled based on the provided criteria. - * - * @param tasks List of tasks available for cancellation. - * @param limit The amount of tasks to select whose resources reach this limit - * @param resourceType The type of resource that needs to be reduced, guiding the selection process. - * - * @return List of tasks that should be cancelled. 
- */ - List selectTasksForCancellation(QueryGroup queryGroup, List tasks, long limit, ResourceType resourceType); -} diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/package-info.java b/server/src/main/java/org/opensearch/wlm/cancellation/package-info.java index 9618d22c9d5e2..1ce7b571e9a9c 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/package-info.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/package-info.java @@ -7,6 +7,6 @@ */ /** - * QueryGroup resource cancellation artifacts + * Workload management resource based cancellation artifacts */ package org.opensearch.wlm.cancellation; diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java index 46a2e4eed56b6..384c143fe4cdc 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java @@ -11,11 +11,8 @@ import org.opensearch.action.search.SearchAction; import org.opensearch.action.search.SearchTask; import org.opensearch.cluster.metadata.QueryGroup; -import org.opensearch.common.settings.ClusterSettings; -import org.opensearch.common.settings.Settings; import org.opensearch.core.tasks.TaskId; import org.opensearch.search.ResourceType; -import org.opensearch.search.backpressure.trackers.NodeDuressTrackers; import org.opensearch.tasks.Task; import org.opensearch.tasks.TaskCancellation; import org.opensearch.test.OpenSearchTestCase; @@ -28,6 +25,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.function.BooleanSupplier; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -39,17 +37,12 @@ public class DefaultTaskCancellationTests extends OpenSearchTestCase { private static class TestTaskCancellationImpl extends DefaultTaskCancellation { 
public TestTaskCancellationImpl( - TaskSelectionStrategy taskSelectionStrategy, + DefaultTaskSelectionStrategy defaultTaskSelectionStrategy, Map queryGroupLevelViews, - Set activeQueryGroups + Set activeQueryGroups, + BooleanSupplier isNodeInDuress ) { - super( - taskSelectionStrategy, - queryGroupLevelViews, - activeQueryGroups, - Settings.EMPTY, - new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS) - ); + super(defaultTaskSelectionStrategy, queryGroupLevelViews, activeQueryGroups, isNodeInDuress); } } @@ -62,9 +55,10 @@ public void setup() { queryGroupLevelViews = new HashMap<>(); activeQueryGroups = new HashSet<>(); taskCancellation = new TestTaskCancellationImpl( - new TaskSelectionStrategyTests.TestTaskSelectionStrategy(), + new DefaultTaskSelectionStrategy(), queryGroupLevelViews, - activeQueryGroups + activeQueryGroups, + () -> false ); } @@ -150,9 +144,10 @@ public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { activeQueryGroups.add(queryGroup1); TestTaskCancellationImpl taskCancellation = new TestTaskCancellationImpl( - new TaskSelectionStrategyTests.TestTaskSelectionStrategy(), + new DefaultTaskSelectionStrategy(), queryGroupLevelViews, - activeQueryGroups + activeQueryGroups, + () -> false ); List cancellableTasksFrom = taskCancellation.getAllCancellableTasksFrom(QueryGroup.ResiliencyMode.SOFT); @@ -177,9 +172,10 @@ public void testCancelTasks_cancelsGivenTasks() { activeQueryGroups.add(queryGroup1); TestTaskCancellationImpl taskCancellation = new TestTaskCancellationImpl( - new TaskSelectionStrategyTests.TestTaskSelectionStrategy(), + new DefaultTaskSelectionStrategy(), queryGroupLevelViews, - activeQueryGroups + activeQueryGroups, + () -> false ); List cancellableTasksFrom = taskCancellation.getAllCancellableTasksFrom(QueryGroup.ResiliencyMode.ENFORCED); @@ -220,15 +216,12 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { Collections.addAll(activeQueryGroups, queryGroup1, queryGroup2); 
TestTaskCancellationImpl taskCancellation = new TestTaskCancellationImpl( - new TaskSelectionStrategyTests.TestTaskSelectionStrategy(), + new DefaultTaskSelectionStrategy(), queryGroupLevelViews, - activeQueryGroups + activeQueryGroups, + () -> true ); - NodeDuressTrackers mock = mock(NodeDuressTrackers.class); - when(mock.isNodeInDuress()).thenReturn(true); - taskCancellation.nodeDuressTrackers = mock; - List cancellableTasksFrom = taskCancellation.getAllCancellableTasksFrom(QueryGroup.ResiliencyMode.ENFORCED); assertEquals(2, cancellableTasksFrom.size()); assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/TaskSelectionStrategyTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java similarity index 85% rename from server/src/test/java/org/opensearch/wlm/cancellation/TaskSelectionStrategyTests.java rename to server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java index f7349cbc628bd..2258f2e63fb74 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/TaskSelectionStrategyTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java @@ -22,21 +22,13 @@ import java.util.ArrayList; import java.util.Collections; -import java.util.Comparator; import java.util.List; import java.util.Map; -public class TaskSelectionStrategyTests extends OpenSearchTestCase { - - public static class TestTaskSelectionStrategy extends AbstractTaskSelectionStrategy { - @Override - public Comparator sortingCondition() { - return Comparator.comparingLong(Task::getId); - } - } +public class DefaultTaskSelectionStrategyTests extends OpenSearchTestCase { public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsGreaterThanZero() { - TaskSelectionStrategy testTaskSelectionStrategy = new TestTaskSelectionStrategy(); + DefaultTaskSelectionStrategy 
testDefaultTaskSelectionStrategy = new DefaultTaskSelectionStrategy(); long thresholdInLong = 100L; Double threshold = 0.1; long reduceBy = 50L; @@ -51,7 +43,7 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsGrea 1L ); - List selectedTasks = testTaskSelectionStrategy.selectTasksForCancellation( + List selectedTasks = testDefaultTaskSelectionStrategy.selectTasksForCancellation( queryGroup, tasks, reduceBy, @@ -67,7 +59,7 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsGrea } public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsLesserThanZero() { - TaskSelectionStrategy testTaskSelectionStrategy = new TestTaskSelectionStrategy(); + DefaultTaskSelectionStrategy testDefaultTaskSelectionStrategy = new DefaultTaskSelectionStrategy(); long thresholdInLong = 100L; Double threshold = 0.1; long reduceBy = -50L; @@ -82,7 +74,7 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsLess ); try { - testTaskSelectionStrategy.selectTasksForCancellation(queryGroup, tasks, reduceBy, resourceType); + testDefaultTaskSelectionStrategy.selectTasksForCancellation(queryGroup, tasks, reduceBy, resourceType); } catch (Exception e) { assertTrue(e instanceof IllegalArgumentException); assertEquals("limit has to be greater than zero", e.getMessage()); @@ -90,7 +82,7 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsLess } public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsEqualToZero() { - TaskSelectionStrategy testTaskSelectionStrategy = new TestTaskSelectionStrategy(); + DefaultTaskSelectionStrategy testDefaultTaskSelectionStrategy = new DefaultTaskSelectionStrategy(); long thresholdInLong = 100L; Double threshold = 0.1; long reduceBy = 0; @@ -104,7 +96,7 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsEqua 1L ); - List selectedTasks = testTaskSelectionStrategy.selectTasksForCancellation( + List 
selectedTasks = testDefaultTaskSelectionStrategy.selectTasksForCancellation( queryGroup, tasks, reduceBy, diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/LongestRunningTaskFirstStrategySelectionStrategyTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/LongestRunningTaskFirstStrategySelectionStrategyTests.java deleted file mode 100644 index ad76a5021b175..0000000000000 --- a/server/src/test/java/org/opensearch/wlm/cancellation/LongestRunningTaskFirstStrategySelectionStrategyTests.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -package org.opensearch.wlm.cancellation; - -import org.opensearch.tasks.Task; -import org.opensearch.test.OpenSearchTestCase; - -import java.util.Arrays; -import java.util.List; - -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - -public class LongestRunningTaskFirstStrategySelectionStrategyTests extends OpenSearchTestCase { - public void testSortingCondition() { - Task task1 = mock(Task.class); - Task task2 = mock(Task.class); - Task task3 = mock(Task.class); - when(task1.getStartTime()).thenReturn(100L); - when(task2.getStartTime()).thenReturn(200L); - when(task3.getStartTime()).thenReturn(300L); - - List tasks = Arrays.asList(task2, task1, task3); - tasks.sort(new LongestRunningTaskFirstSelectionStrategy().sortingCondition()); - - assertEquals(Arrays.asList(task1, task2, task3), tasks); - } -} diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/ShortestRunningTaskFirstStrategySelectionStrategyTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/ShortestRunningTaskFirstStrategySelectionStrategyTests.java deleted file mode 100644 index 3c07df09f6f5e..0000000000000 --- 
a/server/src/test/java/org/opensearch/wlm/cancellation/ShortestRunningTaskFirstStrategySelectionStrategyTests.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -package org.opensearch.wlm.cancellation; - -import org.opensearch.tasks.Task; -import org.opensearch.test.OpenSearchTestCase; - -import java.util.Arrays; -import java.util.List; - -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - -public class ShortestRunningTaskFirstStrategySelectionStrategyTests extends OpenSearchTestCase { - public void testSortingCondition() { - Task task1 = mock(Task.class); - Task task2 = mock(Task.class); - Task task3 = mock(Task.class); - when(task1.getStartTime()).thenReturn(100L); - when(task2.getStartTime()).thenReturn(200L); - when(task3.getStartTime()).thenReturn(300L); - - List tasks = Arrays.asList(task1, task3, task2); - tasks.sort(new ShortestRunningTaskFirstSelectionStrategy().sortingCondition()); - - assertEquals(Arrays.asList(task3, task2, task1), tasks); - } -} From 4b1ef810a5244c0fd3da86896ba2cb3fb107e959 Mon Sep 17 00:00:00 2001 From: Kiran Prakash Date: Mon, 26 Aug 2024 02:53:04 -0700 Subject: [PATCH 06/47] refactor Signed-off-by: Kiran Prakash --- .../cancellation/DefaultTaskCancellation.java | 61 ++++++++++---- .../DefaultTaskSelectionStrategy.java | 34 +++++++- .../DefaultTaskCancellationTests.java | 80 +++++++++++++++++-- .../DefaultTaskSelectionStrategyTests.java | 32 +++++++- 4 files changed, 178 insertions(+), 29 deletions(-) diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java index d740d235ece73..effc9113883a1 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java 
+++ b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java @@ -16,9 +16,9 @@ import org.opensearch.wlm.QueryGroupLevelResourceUsageView; import java.util.ArrayList; +import java.util.Collection; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.function.BooleanSupplier; import java.util.stream.Collectors; @@ -47,18 +47,21 @@ public class DefaultTaskCancellation { protected final DefaultTaskSelectionStrategy defaultTaskSelectionStrategy; // a map of QueryGroupId to its corresponding QueryGroupLevelResourceUsageView object protected final Map queryGroupLevelResourceUsageViews; - protected final Set activeQueryGroups; + protected final Collection activeQueryGroups; + protected final Collection deletedQueryGroups; protected BooleanSupplier isNodeInDuress; public DefaultTaskCancellation( DefaultTaskSelectionStrategy defaultTaskSelectionStrategy, Map queryGroupLevelResourceUsageViews, - Set activeQueryGroups, + Collection activeQueryGroups, + Collection deletedQueryGroups, BooleanSupplier isNodeInDuress ) { this.defaultTaskSelectionStrategy = defaultTaskSelectionStrategy; this.queryGroupLevelResourceUsageViews = queryGroupLevelResourceUsageViews; this.activeQueryGroups = activeQueryGroups; + this.deletedQueryGroups = deletedQueryGroups; this.isNodeInDuress = isNodeInDuress; } @@ -66,29 +69,32 @@ public DefaultTaskCancellation( * Cancel tasks based on the implemented strategy. 
*/ public final void cancelTasks() { - cancelTasksForMode(QueryGroup.ResiliencyMode.ENFORCED); + // cancel tasks from QueryGroups that have been deleted + cancelTasksFromDeletedQueryGroups(); + // cancel tasks from QueryGroups that are in Enforced mode that are breaching their resource limits + cancelTasks(QueryGroup.ResiliencyMode.ENFORCED); if (isNodeInDuress.getAsBoolean()) { - cancelTasksForMode(QueryGroup.ResiliencyMode.SOFT); + cancelTasks(QueryGroup.ResiliencyMode.SOFT); } } - private void cancelTasksForMode(QueryGroup.ResiliencyMode resiliencyMode) { - List cancellableTasks = getAllCancellableTasksFrom(resiliencyMode); - for (TaskCancellation taskCancellation : cancellableTasks) { - taskCancellation.cancel(); - } + /** + * Get all cancellable tasks from the QueryGroups. + * + * @return List of tasks that can be cancelled + */ + protected List getAllCancellableTasks(QueryGroup.ResiliencyMode resiliencyMode) { + return getAllCancellableTasks(getQueryGroupsToCancelFrom(resiliencyMode)); } /** - * Get all cancellable tasks from the QueryGroups. + * Get all cancellable tasks from the given QueryGroups. 
* * @return List of tasks that can be cancelled */ - protected List getAllCancellableTasksFrom(QueryGroup.ResiliencyMode resiliencyMode) { - return getQueryGroupsToCancelFrom(resiliencyMode).stream() - .flatMap(queryGroup -> getCancellableTasksFrom(queryGroup).stream()) - .collect(Collectors.toList()); + protected List getAllCancellableTasks(Collection queryGroups) { + return queryGroups.stream().flatMap(queryGroup -> getCancellableTasksFrom(queryGroup).stream()).collect(Collectors.toList()); } /** @@ -122,6 +128,20 @@ private List getQueryGroupsToCancelFrom(QueryGroup.ResiliencyMode re return queryGroupsToCancelFrom; } + private void cancelTasks(QueryGroup.ResiliencyMode resiliencyMode) { + cancelTasks(getAllCancellableTasks(resiliencyMode)); + } + + private void cancelTasksFromDeletedQueryGroups() { + for (QueryGroup querygroup : this.deletedQueryGroups) { + cancelTasks(getTaskCancellationsForDeletedQueryGroup(querygroup)); + } + } + + private void cancelTasks(List cancellableTasks) { + cancellableTasks.forEach(TaskCancellation::cancel); + } + /** * Get cancellable tasks from a specific queryGroup. 
* @@ -136,8 +156,7 @@ protected List getCancellableTasksFrom(QueryGroup queryGroup) } private boolean shouldCancelTasks(QueryGroup queryGroup, ResourceType resourceType) { - long reduceBy = getReduceBy(queryGroup, resourceType); - return reduceBy > 0; + return getReduceBy(queryGroup, resourceType) > 0; } private List getTaskCancellations(QueryGroup queryGroup, ResourceType resourceType) { @@ -150,6 +169,14 @@ private List getTaskCancellations(QueryGroup queryGroup, Resou ); } + protected List getTaskCancellationsForDeletedQueryGroup(QueryGroup queryGroup) { + return defaultTaskSelectionStrategy.selectTasksFromDeletedQueryGroup( + queryGroup, + // get the active tasks in the query group + queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getActiveTasks() + ); + } + private long getReduceBy(QueryGroup queryGroup, ResourceType resourceType) { if (queryGroup.getResourceLimits().get(resourceType) == null) { return 0; diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java index 05040080a387c..50de62dc2d198 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java @@ -67,7 +67,7 @@ public List selectTasksForCancellation( for (Task task : sortedTasks) { if (task instanceof CancellableTask) { - String cancellationReason = createCancellationReason(querygroup, resourceType); + String cancellationReason = createCancellationReason(querygroup, task, resourceType); selectedTasks.add(createTaskCancellation((CancellableTask) task, cancellationReason)); accumulated += resourceType.getResourceUsage(task); if (accumulated >= limit) { @@ -78,9 +78,37 @@ public List selectTasksForCancellation( return selectedTasks; } - private String createCancellationReason(QueryGroup querygroup, ResourceType resourceType) { + 
/** + * Selects tasks for cancellation from deleted query group. + * + * This method iterates over the provided list of tasks and selects those that are instances of + * {@link CancellableTask}. For each selected task, it creates a cancellation reason and adds + * a {@link TaskCancellation} object to the list of selected tasks. + * + * @param querygroup The {@link QueryGroup} from which the tasks are being selected. + * @param tasks The list of {@link Task} objects to be evaluated for cancellation. + * @return A list of {@link TaskCancellation} objects representing the tasks selected for cancellation. + */ + public List selectTasksFromDeletedQueryGroup(QueryGroup querygroup, List tasks) { + List selectedTasks = new ArrayList<>(); + + for (Task task : tasks) { + if (task instanceof CancellableTask) { + String cancellationReason = "[Workload Management] Cancelling Task ID : " + + task.getId() + + " from QueryGroup ID : " + + querygroup.get_id(); + selectedTasks.add(createTaskCancellation((CancellableTask) task, cancellationReason)); + } + } + return selectedTasks; + } + + private String createCancellationReason(QueryGroup querygroup, Task task, ResourceType resourceType) { Double thresholdInPercent = getThresholdInPercent(querygroup, resourceType); - return "[Workload Management] QueryGroup ID : " + return "[Workload Management] Cancelling Task ID : " + + task.getId() + + " from QueryGroup ID : " + querygroup.get_id() + " breached the resource limit of : " + thresholdInPercent diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java index 384c143fe4cdc..f8e5e83becadc 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java @@ -40,24 +40,28 @@ public TestTaskCancellationImpl( DefaultTaskSelectionStrategy 
defaultTaskSelectionStrategy, Map queryGroupLevelViews, Set activeQueryGroups, + Set deletedQueryGroups, BooleanSupplier isNodeInDuress ) { - super(defaultTaskSelectionStrategy, queryGroupLevelViews, activeQueryGroups, isNodeInDuress); + super(defaultTaskSelectionStrategy, queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, isNodeInDuress); } } private Map queryGroupLevelViews; private Set activeQueryGroups; + private Set deletedQueryGroups; private DefaultTaskCancellation taskCancellation; @Before public void setup() { queryGroupLevelViews = new HashMap<>(); activeQueryGroups = new HashSet<>(); + deletedQueryGroups = new HashSet<>(); taskCancellation = new TestTaskCancellationImpl( new DefaultTaskSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, + deletedQueryGroups, () -> false ); } @@ -100,7 +104,7 @@ public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThresholdForMem queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); - List cancellableTasksFrom = taskCancellation.getAllCancellableTasksFrom(QueryGroup.ResiliencyMode.ENFORCED); + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(QueryGroup.ResiliencyMode.ENFORCED); assertEquals(2, cancellableTasksFrom.size()); assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); @@ -147,10 +151,11 @@ public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { new DefaultTaskSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, + deletedQueryGroups, () -> false ); - List cancellableTasksFrom = taskCancellation.getAllCancellableTasksFrom(QueryGroup.ResiliencyMode.SOFT); + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(QueryGroup.ResiliencyMode.SOFT); assertEquals(0, cancellableTasksFrom.size()); } @@ -175,10 +180,11 @@ public void testCancelTasks_cancelsGivenTasks() { new DefaultTaskSelectionStrategy(), queryGroupLevelViews, 
activeQueryGroups, + deletedQueryGroups, () -> false ); - List cancellableTasksFrom = taskCancellation.getAllCancellableTasksFrom(QueryGroup.ResiliencyMode.ENFORCED); + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(QueryGroup.ResiliencyMode.ENFORCED); assertEquals(2, cancellableTasksFrom.size()); assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); @@ -188,6 +194,63 @@ public void testCancelTasks_cancelsGivenTasks() { assertTrue(cancellableTasksFrom.get(1).getTask().isCancelled()); } + public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { + ResourceType resourceType = ResourceType.CPU; + long usage = 150_000_000_000L; + Double threshold = 0.01; + + QueryGroup activeQueryGroup = new QueryGroup( + "testQueryGroup", + queryGroupId1, + QueryGroup.ResiliencyMode.ENFORCED, + Map.of(resourceType, threshold), + 1L + ); + + QueryGroup deletedQueryGroup = new QueryGroup( + "testQueryGroup", + queryGroupId2, + QueryGroup.ResiliencyMode.ENFORCED, + Map.of(resourceType, threshold), + 1L + ); + + QueryGroupLevelResourceUsageView mockView1 = createResourceUsageViewMock(resourceType, usage); + QueryGroupLevelResourceUsageView mockView2 = mock(QueryGroupLevelResourceUsageView.class); + when(mockView2.getActiveTasks()).thenReturn(List.of(getRandomSearchTask(1000), getRandomSearchTask(1001))); + queryGroupLevelViews.put(queryGroupId1, mockView1); + queryGroupLevelViews.put(queryGroupId2, mockView2); + activeQueryGroups.add(activeQueryGroup); + deletedQueryGroups.add(deletedQueryGroup); + + TestTaskCancellationImpl taskCancellation = new TestTaskCancellationImpl( + new DefaultTaskSelectionStrategy(), + queryGroupLevelViews, + activeQueryGroups, + deletedQueryGroups, + () -> false + ); + + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(QueryGroup.ResiliencyMode.ENFORCED); + assertEquals(2, cancellableTasksFrom.size()); + assertEquals(1234, 
cancellableTasksFrom.get(0).getTask().getId()); + assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); + + List cancellableTasksFromDeletedQueryGroups = taskCancellation.getTaskCancellationsForDeletedQueryGroup( + deletedQueryGroup + ); + assertEquals(2, cancellableTasksFromDeletedQueryGroups.size()); + assertEquals(1000, cancellableTasksFromDeletedQueryGroups.get(0).getTask().getId()); + assertEquals(1001, cancellableTasksFromDeletedQueryGroups.get(1).getTask().getId()); + + taskCancellation.cancelTasks(); + + assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); + assertTrue(cancellableTasksFrom.get(1).getTask().isCancelled()); + assertTrue(cancellableTasksFromDeletedQueryGroups.get(0).getTask().isCancelled()); + assertTrue(cancellableTasksFromDeletedQueryGroups.get(1).getTask().isCancelled()); + } + public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { ResourceType resourceType = ResourceType.CPU; long usage = 150_000_000_000L; @@ -219,15 +282,16 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { new DefaultTaskSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, + deletedQueryGroups, () -> true ); - List cancellableTasksFrom = taskCancellation.getAllCancellableTasksFrom(QueryGroup.ResiliencyMode.ENFORCED); + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(QueryGroup.ResiliencyMode.ENFORCED); assertEquals(2, cancellableTasksFrom.size()); assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); - List cancellableTasksFrom1 = taskCancellation.getAllCancellableTasksFrom(QueryGroup.ResiliencyMode.SOFT); + List cancellableTasksFrom1 = taskCancellation.getAllCancellableTasks(QueryGroup.ResiliencyMode.SOFT); assertEquals(2, cancellableTasksFrom1.size()); assertEquals(5678, cancellableTasksFrom1.get(0).getTask().getId()); assertEquals(8765, cancellableTasksFrom1.get(1).getTask().getId()); @@ -256,7 +320,7 @@ 
public void testGetAllCancellableTasks_ReturnsNoTasksFromWhenNotBreachingThresho queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); - List allCancellableTasks = taskCancellation.getAllCancellableTasksFrom(QueryGroup.ResiliencyMode.ENFORCED); + List allCancellableTasks = taskCancellation.getAllCancellableTasks(QueryGroup.ResiliencyMode.ENFORCED); assertTrue(allCancellableTasks.isEmpty()); } @@ -277,7 +341,7 @@ public void testGetAllCancellableTasks_ReturnsTasksFromWhenBreachingThresholds() queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); - List allCancellableTasks = taskCancellation.getAllCancellableTasksFrom(QueryGroup.ResiliencyMode.ENFORCED); + List allCancellableTasks = taskCancellation.getAllCancellableTasks(QueryGroup.ResiliencyMode.ENFORCED); assertEquals(2, allCancellableTasks.size()); assertEquals(1234, allCancellableTasks.get(0).getTask().getId()); assertEquals(4321, allCancellableTasks.get(1).getTask().getId()); diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java index 2258f2e63fb74..361f52a3b2e38 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java @@ -27,6 +27,34 @@ public class DefaultTaskSelectionStrategyTests extends OpenSearchTestCase { + public void testSelectTasksFromDeletedQueryGroup() { + DefaultTaskSelectionStrategy testDefaultTaskSelectionStrategy = new DefaultTaskSelectionStrategy(); + + long thresholdInLong = 100L; + Double threshold = Double.MIN_VALUE; + long reduceBy = Long.MIN_VALUE; + ResourceType resourceType = ResourceType.MEMORY; + List tasks = getListOfTasks(thresholdInLong); + + QueryGroup queryGroup = new QueryGroup( + "testQueryGroup", + "queryGroupId1", + 
QueryGroup.ResiliencyMode.ENFORCED, + Map.of(resourceType, threshold), + 1L + ); + + List selectedTasks = testDefaultTaskSelectionStrategy.selectTasksFromDeletedQueryGroup(queryGroup, tasks); + + assertFalse(selectedTasks.isEmpty()); + assertEquals( + "[Workload Management] Cancelling Task ID : " + selectedTasks.get(0).getTask().getId() + " from QueryGroup ID : queryGroupId1", + selectedTasks.get(0).getReasonString() + ); + assertEquals(5, selectedTasks.get(0).getReasons().get(0).getCancellationScore()); + assertTrue(tasksUsageMeetsThreshold(selectedTasks, reduceBy)); + } + public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsGreaterThanZero() { DefaultTaskSelectionStrategy testDefaultTaskSelectionStrategy = new DefaultTaskSelectionStrategy(); long thresholdInLong = 100L; @@ -51,7 +79,9 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsGrea ); assertFalse(selectedTasks.isEmpty()); assertEquals( - "[Workload Management] QueryGroup ID : queryGroupId1 breached the resource limit of : 10.0 for resource type : memory", + "[Workload Management] Cancelling Task ID : " + + selectedTasks.get(0).getTask().getId() + + " from QueryGroup ID : queryGroupId1 breached the resource limit of : 10.0 for resource type : memory", selectedTasks.get(0).getReasonString() ); assertEquals(5, selectedTasks.get(0).getReasons().get(0).getCancellationScore()); From 3ea44d75dde8cf417cf1fce33133fb849f407c6e Mon Sep 17 00:00:00 2001 From: Kiran Prakash Date: Tue, 27 Aug 2024 07:26:55 +0530 Subject: [PATCH 07/47] Update DefaultTaskCancellation.java Signed-off-by: Kiran Prakash --- .../cancellation/DefaultTaskCancellation.java | 33 +++++++++++++------ 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java index effc9113883a1..e74849c084f48 100644 --- 
a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java @@ -20,6 +20,7 @@ import java.util.List; import java.util.Map; import java.util.function.BooleanSupplier; +import java.util.function.Consumer; import java.util.stream.Collectors; import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.TRACKED_RESOURCES; @@ -69,16 +70,34 @@ public DefaultTaskCancellation( * Cancel tasks based on the implemented strategy. */ public final void cancelTasks() { - // cancel tasks from QueryGroups that have been deleted - cancelTasksFromDeletedQueryGroups(); // cancel tasks from QueryGroups that are in Enforced mode that are breaching their resource limits cancelTasks(QueryGroup.ResiliencyMode.ENFORCED); + // if the node is in duress, cancel tasks accordingly. + handleNodeDuress(); + } + + private void handleNodeDuress() { + if (!isNodeInDuress.getAsBoolean()) { + return; + } + // List of tasks to be executed in order if the node is in duress + List> duressActions = List.of( + v -> cancelTasksFromDeletedQueryGroups(), + v -> cancelTasks(QueryGroup.ResiliencyMode.SOFT) + ); - if (isNodeInDuress.getAsBoolean()) { - cancelTasks(QueryGroup.ResiliencyMode.SOFT); + for (Consumer duressAction : duressActions) { + if (!isNodeInDuress.getAsBoolean()) { + break; + } + duressAction.accept(null); } } + private void cancelTasksFromDeletedQueryGroups() { + cancelTasks(getAllCancellableTasks(this.deletedQueryGroups)); + } + /** * Get all cancellable tasks from the QueryGroups. 
* @@ -132,12 +151,6 @@ private void cancelTasks(QueryGroup.ResiliencyMode resiliencyMode) { cancelTasks(getAllCancellableTasks(resiliencyMode)); } - private void cancelTasksFromDeletedQueryGroups() { - for (QueryGroup querygroup : this.deletedQueryGroups) { - cancelTasks(getTaskCancellationsForDeletedQueryGroup(querygroup)); - } - } - private void cancelTasks(List cancellableTasks) { cancellableTasks.forEach(TaskCancellation::cancel); } From 0103089889f95c44eb63ea55b359df8caeb01879 Mon Sep 17 00:00:00 2001 From: Kiran Prakash Date: Thu, 29 Aug 2024 17:13:01 +0530 Subject: [PATCH 08/47] Update DefaultTaskCancellation.java Signed-off-by: Kiran Prakash --- .../org/opensearch/wlm/cancellation/DefaultTaskCancellation.java | 1 + 1 file changed, 1 insertion(+) diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java index e74849c084f48..ada0aefda6a18 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java @@ -219,6 +219,7 @@ private Long getResourceUsage(QueryGroup queryGroup, ResourceType resourceType) return queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getResourceUsageData().get(resourceType); } + private boolean isBreachingThreshold(ResourceType resourceType, Double resourceThresholdInPercentage, long resourceUsage) { if (resourceType == ResourceType.MEMORY) { // Check if resource usage is breaching the threshold From 092d715bb48920c80e10395b11b8386e941cacab Mon Sep 17 00:00:00 2001 From: Kiran Prakash Date: Thu, 29 Aug 2024 17:13:10 +0530 Subject: [PATCH 09/47] Update DefaultTaskCancellation.java Signed-off-by: Kiran Prakash --- .../org/opensearch/wlm/cancellation/DefaultTaskCancellation.java | 1 - 1 file changed, 1 deletion(-) diff --git 
a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java index ada0aefda6a18..e74849c084f48 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java @@ -219,7 +219,6 @@ private Long getResourceUsage(QueryGroup queryGroup, ResourceType resourceType) return queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getResourceUsageData().get(resourceType); } - private boolean isBreachingThreshold(ResourceType resourceType, Double resourceThresholdInPercentage, long resourceUsage) { if (resourceType == ResourceType.MEMORY) { // Check if resource usage is breaching the threshold From 4a2c51ebd083475a7df13fd64c25dabf231f3e03 Mon Sep 17 00:00:00 2001 From: Kiran Prakash Date: Thu, 29 Aug 2024 17:19:29 +0530 Subject: [PATCH 10/47] Update DefaultTaskSelectionStrategy.java Signed-off-by: Kiran Prakash --- .../wlm/cancellation/DefaultTaskSelectionStrategy.java | 1 - 1 file changed, 1 deletion(-) diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java index 50de62dc2d198..a4c4234d37582 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java @@ -80,7 +80,6 @@ public List selectTasksForCancellation( /** * Selects tasks for cancellation from deleted query group. - * * This method iterates over the provided list of tasks and selects those that are instances of * {@link CancellableTask}. For each selected task, it creates a cancellation reason and adds * a {@link TaskCancellation} object to the list of selected tasks. 
From cbb51bdf3c6ff29671200e9e502e5caf44be7f84 Mon Sep 17 00:00:00 2001 From: Kiran Prakash Date: Thu, 29 Aug 2024 18:38:13 +0530 Subject: [PATCH 11/47] refactor Signed-off-by: Kiran Prakash --- .../cancellation/DefaultTaskCancellation.java | 49 +++++++- .../DefaultTaskSelectionStrategy.java | 53 ++------- .../DefaultTaskCancellationTests.java | 107 +++++++++++++++++- .../DefaultTaskSelectionStrategyTests.java | 69 ++--------- 4 files changed, 162 insertions(+), 116 deletions(-) diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java index e74849c084f48..d502ce0394c63 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java @@ -12,6 +12,8 @@ import org.opensearch.monitor.jvm.JvmStats; import org.opensearch.monitor.process.ProcessProbe; import org.opensearch.search.ResourceType; +import org.opensearch.tasks.CancellableTask; +import org.opensearch.tasks.Task; import org.opensearch.tasks.TaskCancellation; import org.opensearch.wlm.QueryGroupLevelResourceUsageView; @@ -173,21 +175,52 @@ private boolean shouldCancelTasks(QueryGroup queryGroup, ResourceType resourceTy } private List getTaskCancellations(QueryGroup queryGroup, ResourceType resourceType) { - return defaultTaskSelectionStrategy.selectTasksForCancellation( - queryGroup, - // get the active tasks in the query group + List selectedTasksToCancel = defaultTaskSelectionStrategy.selectTasksForCancellation( queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getActiveTasks(), getReduceBy(queryGroup, resourceType), resourceType ); + List taskCancellations = new ArrayList<>(); + for(Task task : selectedTasksToCancel) { + String cancellationReason = createCancellationReason(queryGroup, task, resourceType); + 
taskCancellations.add(createTaskCancellation((CancellableTask) task, cancellationReason)); + } + return taskCancellations; + } + + private String createCancellationReason(QueryGroup querygroup, Task task, ResourceType resourceType) { + Double thresholdInPercent = getThresholdInPercent(querygroup, resourceType); + return "[Workload Management] Cancelling Task ID : " + + task.getId() + + " from QueryGroup ID : " + + querygroup.get_id() + + " breached the resource limit of : " + + thresholdInPercent + + " for resource type : " + + resourceType.getName(); + } + + private Double getThresholdInPercent(QueryGroup querygroup, ResourceType resourceType) { + return ((Double) (querygroup.getResourceLimits().get(resourceType))) * 100; + } + + private TaskCancellation createTaskCancellation(CancellableTask task, String cancellationReason) { + return new TaskCancellation(task, List.of(new TaskCancellation.Reason(cancellationReason, 5)), List.of(this::callbackOnCancel)); } protected List getTaskCancellationsForDeletedQueryGroup(QueryGroup queryGroup) { - return defaultTaskSelectionStrategy.selectTasksFromDeletedQueryGroup( - queryGroup, - // get the active tasks in the query group + List tasks = defaultTaskSelectionStrategy.selectTasksFromDeletedQueryGroup( queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getActiveTasks() ); + List taskCancellations = new ArrayList<>(); + for(Task task : tasks) { + String cancellationReason = "[Workload Management] Cancelling Task ID : " + + task.getId() + + " from QueryGroup ID : " + + queryGroup.get_id(); + taskCancellations.add(createTaskCancellation((CancellableTask) task, cancellationReason)); + } + return taskCancellations; } private long getReduceBy(QueryGroup queryGroup, ResourceType resourceType) { @@ -229,4 +262,8 @@ private boolean isBreachingThreshold(ResourceType resourceType, Double resourceT // Check if resource usage is breaching the threshold return resourceUsageInMillis > convertThresholdIntoLong(resourceType, 
resourceThresholdInPercentage); } + + private void callbackOnCancel() { + // TODO Implement callback logic here mostly used for Stats + } } diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java index a4c4234d37582..124873647c2e5 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java @@ -8,7 +8,6 @@ package org.opensearch.wlm.cancellation; -import org.opensearch.cluster.metadata.QueryGroup; import org.opensearch.search.ResourceType; import org.opensearch.tasks.CancellableTask; import org.opensearch.tasks.Task; @@ -47,8 +46,7 @@ public Comparator sortingCondition() { * @return The list of selected tasks * @throws IllegalArgumentException If the limit is less than zero */ - public List selectTasksForCancellation( - QueryGroup querygroup, + public List selectTasksForCancellation( List tasks, long limit, ResourceType resourceType @@ -62,13 +60,11 @@ public List selectTasksForCancellation( List sortedTasks = tasks.stream().sorted(sortingCondition()).collect(Collectors.toList()); - List selectedTasks = new ArrayList<>(); + List selectedTasks = new ArrayList<>(); long accumulated = 0; - for (Task task : sortedTasks) { if (task instanceof CancellableTask) { - String cancellationReason = createCancellationReason(querygroup, task, resourceType); - selectedTasks.add(createTaskCancellation((CancellableTask) task, cancellationReason)); + selectedTasks.add(task); accumulated += resourceType.getResourceUsage(task); if (accumulated >= limit) { break; @@ -84,46 +80,13 @@ public List selectTasksForCancellation( * {@link CancellableTask}. For each selected task, it creates a cancellation reason and adds * a {@link TaskCancellation} object to the list of selected tasks. 
* - * @param querygroup The {@link QueryGroup} from which the tasks are being selected. * @param tasks The list of {@link Task} objects to be evaluated for cancellation. * @return A list of {@link TaskCancellation} objects representing the tasks selected for cancellation. */ - public List selectTasksFromDeletedQueryGroup(QueryGroup querygroup, List tasks) { - List selectedTasks = new ArrayList<>(); - - for (Task task : tasks) { - if (task instanceof CancellableTask) { - String cancellationReason = "[Workload Management] Cancelling Task ID : " - + task.getId() - + " from QueryGroup ID : " - + querygroup.get_id(); - selectedTasks.add(createTaskCancellation((CancellableTask) task, cancellationReason)); - } - } - return selectedTasks; - } - - private String createCancellationReason(QueryGroup querygroup, Task task, ResourceType resourceType) { - Double thresholdInPercent = getThresholdInPercent(querygroup, resourceType); - return "[Workload Management] Cancelling Task ID : " - + task.getId() - + " from QueryGroup ID : " - + querygroup.get_id() - + " breached the resource limit of : " - + thresholdInPercent - + " for resource type : " - + resourceType.getName(); - } - - private Double getThresholdInPercent(QueryGroup querygroup, ResourceType resourceType) { - return ((Double) (querygroup.getResourceLimits().get(resourceType))) * 100; - } - - private TaskCancellation createTaskCancellation(CancellableTask task, String cancellationReason) { - return new TaskCancellation(task, List.of(new TaskCancellation.Reason(cancellationReason, 5)), List.of(this::callbackOnCancel)); - } - - private void callbackOnCancel() { - // TODO Implement callback logic here mostly used for Stats + public List selectTasksFromDeletedQueryGroup(List tasks) { + return tasks + .stream() + .filter(task -> task instanceof CancellableTask) + .collect(Collectors.toList()); } } diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java 
b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java index f8e5e83becadc..a455478b27116 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java @@ -19,6 +19,7 @@ import org.opensearch.wlm.QueryGroupLevelResourceUsageView; import org.junit.Before; +import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@ -26,6 +27,7 @@ import java.util.Map; import java.util.Set; import java.util.function.BooleanSupplier; +import java.util.stream.Collectors; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -66,6 +68,35 @@ public void setup() { ); } + public void testGetCancellableTasksFrom_setupAppropriateCancellationReasonAndScore() { + ResourceType resourceType = ResourceType.CPU; + long usage = 100_000_000L; + Double threshold = 0.1; + + QueryGroup queryGroup1 = new QueryGroup( + "testQueryGroup", + queryGroupId1, + QueryGroup.ResiliencyMode.ENFORCED, + Map.of(resourceType, threshold), + 1L + ); + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage); + queryGroupLevelViews.put(queryGroupId1, mockView); + + List cancellableTasksFrom = taskCancellation.getCancellableTasksFrom(queryGroup1); + assertEquals(2, cancellableTasksFrom.size()); + assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); + assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); + assertEquals( + "[Workload Management] Cancelling Task ID : " + + cancellableTasksFrom.get(0).getTask().getId() + + " from QueryGroup ID : queryGroup1" + + " breached the resource limit of : 10.0 for resource type : cpu", + cancellableTasksFrom.get(0).getReasonString() + ); + assertEquals(5, cancellableTasksFrom.get(0).getReasons().get(0).getCancellationScore()); + } + public void 
testGetCancellableTasksFrom_returnsTasksWhenBreachingThreshold() { ResourceType resourceType = ResourceType.CPU; long usage = 100_000_000L; @@ -216,8 +247,7 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { ); QueryGroupLevelResourceUsageView mockView1 = createResourceUsageViewMock(resourceType, usage); - QueryGroupLevelResourceUsageView mockView2 = mock(QueryGroupLevelResourceUsageView.class); - when(mockView2.getActiveTasks()).thenReturn(List.of(getRandomSearchTask(1000), getRandomSearchTask(1001))); + QueryGroupLevelResourceUsageView mockView2 = createResourceUsageViewMock(resourceType, usage, List.of(1000, 1001)); queryGroupLevelViews.put(queryGroupId1, mockView1); queryGroupLevelViews.put(queryGroupId2, mockView2); activeQueryGroups.add(activeQueryGroup); @@ -228,7 +258,7 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, - () -> false + () -> true ); List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(QueryGroup.ResiliencyMode.ENFORCED); @@ -251,6 +281,62 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { assertTrue(cancellableTasksFromDeletedQueryGroups.get(1).getTask().isCancelled()); } + public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeNotInDuress() { + ResourceType resourceType = ResourceType.CPU; + long usage = 150_000_000_000L; + Double threshold = 0.01; + + QueryGroup activeQueryGroup = new QueryGroup( + "testQueryGroup", + queryGroupId1, + QueryGroup.ResiliencyMode.ENFORCED, + Map.of(resourceType, threshold), + 1L + ); + + QueryGroup deletedQueryGroup = new QueryGroup( + "testQueryGroup", + queryGroupId2, + QueryGroup.ResiliencyMode.ENFORCED, + Map.of(resourceType, threshold), + 1L + ); + + QueryGroupLevelResourceUsageView mockView1 = createResourceUsageViewMock(resourceType, usage); + QueryGroupLevelResourceUsageView mockView2 = createResourceUsageViewMock(resourceType, usage, 
List.of(1000, 1001)); + queryGroupLevelViews.put(queryGroupId1, mockView1); + queryGroupLevelViews.put(queryGroupId2, mockView2); + activeQueryGroups.add(activeQueryGroup); + deletedQueryGroups.add(deletedQueryGroup); + + TestTaskCancellationImpl taskCancellation = new TestTaskCancellationImpl( + new DefaultTaskSelectionStrategy(), + queryGroupLevelViews, + activeQueryGroups, + deletedQueryGroups, + () -> false + ); + + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(QueryGroup.ResiliencyMode.ENFORCED); + assertEquals(2, cancellableTasksFrom.size()); + assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); + assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); + + List cancellableTasksFromDeletedQueryGroups = taskCancellation.getTaskCancellationsForDeletedQueryGroup( + deletedQueryGroup + ); + assertEquals(2, cancellableTasksFromDeletedQueryGroups.size()); + assertEquals(1000, cancellableTasksFromDeletedQueryGroups.get(0).getTask().getId()); + assertEquals(1001, cancellableTasksFromDeletedQueryGroups.get(1).getTask().getId()); + + taskCancellation.cancelTasks(); + + assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); + assertTrue(cancellableTasksFrom.get(1).getTask().isCancelled()); + assertFalse(cancellableTasksFromDeletedQueryGroups.get(0).getTask().isCancelled()); + assertFalse(cancellableTasksFromDeletedQueryGroups.get(1).getTask().isCancelled()); + } + public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { ResourceType resourceType = ResourceType.CPU; long usage = 150_000_000_000L; @@ -384,6 +470,21 @@ private QueryGroupLevelResourceUsageView createResourceUsageViewMock(ResourceTyp return mockView; } + private QueryGroupLevelResourceUsageView createResourceUsageViewMock( + ResourceType resourceType, + Long usage, + Collection ids + ) { + QueryGroupLevelResourceUsageView mockView = mock(QueryGroupLevelResourceUsageView.class); + 
when(mockView.getResourceUsageData()).thenReturn(Collections.singletonMap(resourceType, usage)); + when(mockView.getActiveTasks()).thenReturn( + ids.stream() + .map(this::getRandomSearchTask) + .collect(Collectors.toList()) + ); + return mockView; + } + private Task getRandomSearchTask(long id) { return new SearchTask( id, diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java index 361f52a3b2e38..9649a5dea0bb7 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java @@ -29,82 +29,37 @@ public class DefaultTaskSelectionStrategyTests extends OpenSearchTestCase { public void testSelectTasksFromDeletedQueryGroup() { DefaultTaskSelectionStrategy testDefaultTaskSelectionStrategy = new DefaultTaskSelectionStrategy(); - long thresholdInLong = 100L; - Double threshold = Double.MIN_VALUE; long reduceBy = Long.MIN_VALUE; - ResourceType resourceType = ResourceType.MEMORY; List tasks = getListOfTasks(thresholdInLong); - - QueryGroup queryGroup = new QueryGroup( - "testQueryGroup", - "queryGroupId1", - QueryGroup.ResiliencyMode.ENFORCED, - Map.of(resourceType, threshold), - 1L - ); - - List selectedTasks = testDefaultTaskSelectionStrategy.selectTasksFromDeletedQueryGroup(queryGroup, tasks); - + List selectedTasks = testDefaultTaskSelectionStrategy.selectTasksFromDeletedQueryGroup(tasks); assertFalse(selectedTasks.isEmpty()); - assertEquals( - "[Workload Management] Cancelling Task ID : " + selectedTasks.get(0).getTask().getId() + " from QueryGroup ID : queryGroupId1", - selectedTasks.get(0).getReasonString() - ); - assertEquals(5, selectedTasks.get(0).getReasons().get(0).getCancellationScore()); assertTrue(tasksUsageMeetsThreshold(selectedTasks, reduceBy)); } public void 
testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsGreaterThanZero() { DefaultTaskSelectionStrategy testDefaultTaskSelectionStrategy = new DefaultTaskSelectionStrategy(); long thresholdInLong = 100L; - Double threshold = 0.1; long reduceBy = 50L; ResourceType resourceType = ResourceType.MEMORY; List tasks = getListOfTasks(thresholdInLong); - - QueryGroup queryGroup = new QueryGroup( - "testQueryGroup", - "queryGroupId1", - QueryGroup.ResiliencyMode.ENFORCED, - Map.of(resourceType, threshold), - 1L - ); - - List selectedTasks = testDefaultTaskSelectionStrategy.selectTasksForCancellation( - queryGroup, + List selectedTasks = testDefaultTaskSelectionStrategy.selectTasksForCancellation( tasks, reduceBy, resourceType ); assertFalse(selectedTasks.isEmpty()); - assertEquals( - "[Workload Management] Cancelling Task ID : " - + selectedTasks.get(0).getTask().getId() - + " from QueryGroup ID : queryGroupId1 breached the resource limit of : 10.0 for resource type : memory", - selectedTasks.get(0).getReasonString() - ); - assertEquals(5, selectedTasks.get(0).getReasons().get(0).getCancellationScore()); assertTrue(tasksUsageMeetsThreshold(selectedTasks, reduceBy)); } public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsLesserThanZero() { DefaultTaskSelectionStrategy testDefaultTaskSelectionStrategy = new DefaultTaskSelectionStrategy(); long thresholdInLong = 100L; - Double threshold = 0.1; long reduceBy = -50L; ResourceType resourceType = ResourceType.MEMORY; List tasks = getListOfTasks(thresholdInLong); - QueryGroup queryGroup = new QueryGroup( - "testQueryGroup", - "queryGroupId1", - QueryGroup.ResiliencyMode.ENFORCED, - Map.of(resourceType, threshold), - 1L - ); - try { - testDefaultTaskSelectionStrategy.selectTasksForCancellation(queryGroup, tasks, reduceBy, resourceType); + testDefaultTaskSelectionStrategy.selectTasksForCancellation(tasks, reduceBy, resourceType); } catch (Exception e) { assertTrue(e instanceof IllegalArgumentException); 
assertEquals("limit has to be greater than zero", e.getMessage()); @@ -114,20 +69,10 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsLess public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsEqualToZero() { DefaultTaskSelectionStrategy testDefaultTaskSelectionStrategy = new DefaultTaskSelectionStrategy(); long thresholdInLong = 100L; - Double threshold = 0.1; long reduceBy = 0; ResourceType resourceType = ResourceType.MEMORY; List tasks = getListOfTasks(thresholdInLong); - QueryGroup queryGroup = new QueryGroup( - "testQueryGroup", - "queryGroupId1", - QueryGroup.ResiliencyMode.ENFORCED, - Map.of(resourceType, threshold), - 1L - ); - - List selectedTasks = testDefaultTaskSelectionStrategy.selectTasksForCancellation( - queryGroup, + List selectedTasks = testDefaultTaskSelectionStrategy.selectTasksForCancellation( tasks, reduceBy, resourceType @@ -135,10 +80,10 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsEqua assertTrue(selectedTasks.isEmpty()); } - private boolean tasksUsageMeetsThreshold(List selectedTasks, long threshold) { + private boolean tasksUsageMeetsThreshold(List selectedTasks, long threshold) { long memory = 0; - for (TaskCancellation task : selectedTasks) { - memory += task.getTask().getTotalResourceUtilization(ResourceStats.MEMORY); + for (Task task : selectedTasks) { + memory += task.getTotalResourceUtilization(ResourceStats.MEMORY); if (memory > threshold) { return true; } From 4e846e2f5766d7831c02ff7ee94fec93cccd9bd5 Mon Sep 17 00:00:00 2001 From: Kiran Prakash Date: Thu, 29 Aug 2024 22:02:27 +0530 Subject: [PATCH 12/47] refactor node level threshold Signed-off-by: Kiran Prakash --- .../cancellation/DefaultTaskCancellation.java | 16 +++++++--- .../DefaultTaskSelectionStrategy.java | 11 ++----- .../DefaultTaskCancellationTests.java | 32 ++++++++++++------- .../DefaultTaskSelectionStrategyTests.java | 15 ++------- 4 files changed, 37 insertions(+), 37 deletions(-) 
diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java index d502ce0394c63..12e4fbc595e6d 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java @@ -16,6 +16,7 @@ import org.opensearch.tasks.Task; import org.opensearch.tasks.TaskCancellation; import org.opensearch.wlm.QueryGroupLevelResourceUsageView; +import org.opensearch.wlm.WorkloadManagementSettings; import java.util.ArrayList; import java.util.Collection; @@ -47,6 +48,7 @@ public class DefaultTaskCancellation { private static final long HEAP_SIZE_BYTES = JvmStats.jvmStats().getMem().getHeapMax().getBytes(); + protected final WorkloadManagementSettings workloadManagementSettings; protected final DefaultTaskSelectionStrategy defaultTaskSelectionStrategy; // a map of QueryGroupId to its corresponding QueryGroupLevelResourceUsageView object protected final Map queryGroupLevelResourceUsageViews; @@ -55,12 +57,14 @@ public class DefaultTaskCancellation { protected BooleanSupplier isNodeInDuress; public DefaultTaskCancellation( + WorkloadManagementSettings workloadManagementSettings, DefaultTaskSelectionStrategy defaultTaskSelectionStrategy, Map queryGroupLevelResourceUsageViews, Collection activeQueryGroups, Collection deletedQueryGroups, BooleanSupplier isNodeInDuress ) { + this.workloadManagementSettings = workloadManagementSettings; this.defaultTaskSelectionStrategy = defaultTaskSelectionStrategy; this.queryGroupLevelResourceUsageViews = queryGroupLevelResourceUsageViews; this.activeQueryGroups = activeQueryGroups; @@ -181,7 +185,7 @@ private List getTaskCancellations(QueryGroup queryGroup, Resou resourceType ); List taskCancellations = new ArrayList<>(); - for(Task task : selectedTasksToCancel) { + for (Task task : selectedTasksToCancel) { String 
cancellationReason = createCancellationReason(queryGroup, task, resourceType); taskCancellations.add(createTaskCancellation((CancellableTask) task, cancellationReason)); } @@ -213,7 +217,7 @@ protected List getTaskCancellationsForDeletedQueryGroup(QueryG queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getActiveTasks() ); List taskCancellations = new ArrayList<>(); - for(Task task : tasks) { + for (Task task : tasks) { String cancellationReason = "[Workload Management] Cancelling Task ID : " + task.getId() + " from QueryGroup ID : " @@ -235,12 +239,16 @@ private Long convertThresholdIntoLong(ResourceType resourceType, Double resource Long threshold = null; if (resourceType == ResourceType.MEMORY) { // Check if resource usage is breaching the threshold - threshold = (long) (resourceThresholdInPercentage * HEAP_SIZE_BYTES); + double nodeLevelCancellationThreshold = this.workloadManagementSettings.getNodeLevelMemoryCancellationThreshold() + * HEAP_SIZE_BYTES; + threshold = (long) (resourceThresholdInPercentage * nodeLevelCancellationThreshold); } else if (resourceType == ResourceType.CPU) { // Get the total CPU time of the process in milliseconds long cpuTotalTimeInMillis = ProcessProbe.getInstance().getProcessCpuTotalTime(); + double nodeLevelCancellationThreshold = this.workloadManagementSettings.getNodeLevelCpuCancellationThreshold() + * cpuTotalTimeInMillis; // Check if resource usage is breaching the threshold - threshold = (long) (resourceThresholdInPercentage * cpuTotalTimeInMillis); + threshold = (long) (resourceThresholdInPercentage * nodeLevelCancellationThreshold); } return threshold; } diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java index 124873647c2e5..33b854ce5d760 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java +++ 
b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java @@ -46,11 +46,7 @@ public Comparator sortingCondition() { * @return The list of selected tasks * @throws IllegalArgumentException If the limit is less than zero */ - public List selectTasksForCancellation( - List tasks, - long limit, - ResourceType resourceType - ) { + public List selectTasksForCancellation(List tasks, long limit, ResourceType resourceType) { if (limit < 0) { throw new IllegalArgumentException("limit has to be greater than zero"); } @@ -84,9 +80,6 @@ public List selectTasksForCancellation( * @return A list of {@link TaskCancellation} objects representing the tasks selected for cancellation. */ public List selectTasksFromDeletedQueryGroup(List tasks) { - return tasks - .stream() - .filter(task -> task instanceof CancellableTask) - .collect(Collectors.toList()); + return tasks.stream().filter(task -> task instanceof CancellableTask).collect(Collectors.toList()); } } diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java index a455478b27116..5c77c5c7f7a55 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java @@ -17,6 +17,7 @@ import org.opensearch.tasks.TaskCancellation; import org.opensearch.test.OpenSearchTestCase; import org.opensearch.wlm.QueryGroupLevelResourceUsageView; +import org.opensearch.wlm.WorkloadManagementSettings; import org.junit.Before; import java.util.Collection; @@ -39,13 +40,21 @@ public class DefaultTaskCancellationTests extends OpenSearchTestCase { private static class TestTaskCancellationImpl extends DefaultTaskCancellation { public TestTaskCancellationImpl( + WorkloadManagementSettings workloadManagementSettings, DefaultTaskSelectionStrategy defaultTaskSelectionStrategy, 
Map queryGroupLevelViews, Set activeQueryGroups, Set deletedQueryGroups, BooleanSupplier isNodeInDuress ) { - super(defaultTaskSelectionStrategy, queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, isNodeInDuress); + super( + workloadManagementSettings, + defaultTaskSelectionStrategy, + queryGroupLevelViews, + activeQueryGroups, + deletedQueryGroups, + isNodeInDuress + ); } } @@ -53,13 +62,16 @@ public TestTaskCancellationImpl( private Set activeQueryGroups; private Set deletedQueryGroups; private DefaultTaskCancellation taskCancellation; + private WorkloadManagementSettings workloadManagementSettings; @Before public void setup() { + workloadManagementSettings = mock(WorkloadManagementSettings.class); queryGroupLevelViews = new HashMap<>(); activeQueryGroups = new HashSet<>(); deletedQueryGroups = new HashSet<>(); taskCancellation = new TestTaskCancellationImpl( + workloadManagementSettings, new DefaultTaskSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, @@ -156,6 +168,7 @@ public void testGetCancellableTasksFrom_returnsNoTasksWhenNotBreachingThreshold( QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage); queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); + when(workloadManagementSettings.getNodeLevelCpuCancellationThreshold()).thenReturn(0.90); List cancellableTasksFrom = taskCancellation.getCancellableTasksFrom(queryGroup1); assertTrue(cancellableTasksFrom.isEmpty()); @@ -179,6 +192,7 @@ public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { activeQueryGroups.add(queryGroup1); TestTaskCancellationImpl taskCancellation = new TestTaskCancellationImpl( + workloadManagementSettings, new DefaultTaskSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, @@ -208,6 +222,7 @@ public void testCancelTasks_cancelsGivenTasks() { activeQueryGroups.add(queryGroup1); TestTaskCancellationImpl taskCancellation = new TestTaskCancellationImpl( + 
workloadManagementSettings, new DefaultTaskSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, @@ -254,6 +269,7 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { deletedQueryGroups.add(deletedQueryGroup); TestTaskCancellationImpl taskCancellation = new TestTaskCancellationImpl( + workloadManagementSettings, new DefaultTaskSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, @@ -310,6 +326,7 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN deletedQueryGroups.add(deletedQueryGroup); TestTaskCancellationImpl taskCancellation = new TestTaskCancellationImpl( + workloadManagementSettings, new DefaultTaskSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, @@ -365,6 +382,7 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { Collections.addAll(activeQueryGroups, queryGroup1, queryGroup2); TestTaskCancellationImpl taskCancellation = new TestTaskCancellationImpl( + workloadManagementSettings, new DefaultTaskSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, @@ -470,18 +488,10 @@ private QueryGroupLevelResourceUsageView createResourceUsageViewMock(ResourceTyp return mockView; } - private QueryGroupLevelResourceUsageView createResourceUsageViewMock( - ResourceType resourceType, - Long usage, - Collection ids - ) { + private QueryGroupLevelResourceUsageView createResourceUsageViewMock(ResourceType resourceType, Long usage, Collection ids) { QueryGroupLevelResourceUsageView mockView = mock(QueryGroupLevelResourceUsageView.class); when(mockView.getResourceUsageData()).thenReturn(Collections.singletonMap(resourceType, usage)); - when(mockView.getActiveTasks()).thenReturn( - ids.stream() - .map(this::getRandomSearchTask) - .collect(Collectors.toList()) - ); + when(mockView.getActiveTasks()).thenReturn(ids.stream().map(this::getRandomSearchTask).collect(Collectors.toList())); return mockView; } diff --git 
a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java index 9649a5dea0bb7..fc2b7e42406cb 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java @@ -10,20 +10,17 @@ import org.opensearch.action.search.SearchAction; import org.opensearch.action.search.SearchTask; -import org.opensearch.cluster.metadata.QueryGroup; import org.opensearch.core.tasks.TaskId; import org.opensearch.core.tasks.resourcetracker.ResourceStats; import org.opensearch.core.tasks.resourcetracker.ResourceStatsType; import org.opensearch.core.tasks.resourcetracker.ResourceUsageMetric; import org.opensearch.search.ResourceType; import org.opensearch.tasks.Task; -import org.opensearch.tasks.TaskCancellation; import org.opensearch.test.OpenSearchTestCase; import java.util.ArrayList; import java.util.Collections; import java.util.List; -import java.util.Map; public class DefaultTaskSelectionStrategyTests extends OpenSearchTestCase { @@ -43,11 +40,7 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsGrea long reduceBy = 50L; ResourceType resourceType = ResourceType.MEMORY; List tasks = getListOfTasks(thresholdInLong); - List selectedTasks = testDefaultTaskSelectionStrategy.selectTasksForCancellation( - tasks, - reduceBy, - resourceType - ); + List selectedTasks = testDefaultTaskSelectionStrategy.selectTasksForCancellation(tasks, reduceBy, resourceType); assertFalse(selectedTasks.isEmpty()); assertTrue(tasksUsageMeetsThreshold(selectedTasks, reduceBy)); } @@ -72,11 +65,7 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsEqua long reduceBy = 0; ResourceType resourceType = ResourceType.MEMORY; List tasks = getListOfTasks(thresholdInLong); - List selectedTasks = 
testDefaultTaskSelectionStrategy.selectTasksForCancellation( - tasks, - reduceBy, - resourceType - ); + List selectedTasks = testDefaultTaskSelectionStrategy.selectTasksForCancellation(tasks, reduceBy, resourceType); assertTrue(selectedTasks.isEmpty()); } From 7511d99e14c698cdf46f71e9cc8868080d4bccea Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Fri, 30 Aug 2024 15:19:50 -0700 Subject: [PATCH 13/47] use query group task Signed-off-by: Kaushal Kumar --- .../wlm/QueryGroupLevelResourceUsageView.java | 7 ++--- .../cancellation/DefaultTaskCancellation.java | 23 ++++++++------- .../DefaultTaskSelectionStrategy.java | 28 +++++++++---------- ...QueryGroupResourceUsageTrackerService.java | 8 +++--- ...QueryGroupLevelResourceUsageViewTests.java | 9 +++--- .../DefaultTaskSelectionStrategyTests.java | 23 +++++++-------- 6 files changed, 47 insertions(+), 51 deletions(-) diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupLevelResourceUsageView.java b/server/src/main/java/org/opensearch/wlm/QueryGroupLevelResourceUsageView.java index 7577c8573ec10..6cb73951ac6c0 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupLevelResourceUsageView.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupLevelResourceUsageView.java @@ -8,7 +8,6 @@ package org.opensearch.wlm; -import org.opensearch.tasks.Task; import java.util.List; import java.util.Map; @@ -22,9 +21,9 @@ public class QueryGroupLevelResourceUsageView { // resourceUsage holds the resource usage data for a QueryGroup at a point in time private final Map resourceUsage; // activeTasks holds the list of active tasks for a QueryGroup at a point in time - private final List activeTasks; + private final List activeTasks; - public QueryGroupLevelResourceUsageView(Map resourceUsage, List activeTasks) { + public QueryGroupLevelResourceUsageView(Map resourceUsage, List activeTasks) { this.resourceUsage = resourceUsage; this.activeTasks = activeTasks; } @@ -43,7 +42,7 @@ public Map getResourceUsageData() { 
* * @return The list of active tasks */ - public List getActiveTasks() { + public List getActiveTasks() { return activeTasks; } } diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java index 12e4fbc595e6d..14b454c788b83 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java @@ -11,9 +11,9 @@ import org.opensearch.cluster.metadata.QueryGroup; import org.opensearch.monitor.jvm.JvmStats; import org.opensearch.monitor.process.ProcessProbe; -import org.opensearch.search.ResourceType; +import org.opensearch.wlm.QueryGroupTask; +import org.opensearch.wlm.ResourceType; import org.opensearch.tasks.CancellableTask; -import org.opensearch.tasks.Task; import org.opensearch.tasks.TaskCancellation; import org.opensearch.wlm.QueryGroupLevelResourceUsageView; import org.opensearch.wlm.WorkloadManagementSettings; @@ -179,20 +179,20 @@ private boolean shouldCancelTasks(QueryGroup queryGroup, ResourceType resourceTy } private List getTaskCancellations(QueryGroup queryGroup, ResourceType resourceType) { - List selectedTasksToCancel = defaultTaskSelectionStrategy.selectTasksForCancellation( + List selectedTasksToCancel = defaultTaskSelectionStrategy.selectTasksForCancellation( queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getActiveTasks(), getReduceBy(queryGroup, resourceType), resourceType ); List taskCancellations = new ArrayList<>(); - for (Task task : selectedTasksToCancel) { + for (QueryGroupTask task : selectedTasksToCancel) { String cancellationReason = createCancellationReason(queryGroup, task, resourceType); - taskCancellations.add(createTaskCancellation((CancellableTask) task, cancellationReason)); + taskCancellations.add(createTaskCancellation(task, cancellationReason)); } return taskCancellations; } - private 
String createCancellationReason(QueryGroup querygroup, Task task, ResourceType resourceType) { + private String createCancellationReason(QueryGroup querygroup, QueryGroupTask task, ResourceType resourceType) { Double thresholdInPercent = getThresholdInPercent(querygroup, resourceType); return "[Workload Management] Cancelling Task ID : " + task.getId() @@ -213,16 +213,15 @@ private TaskCancellation createTaskCancellation(CancellableTask task, String can } protected List getTaskCancellationsForDeletedQueryGroup(QueryGroup queryGroup) { - List tasks = defaultTaskSelectionStrategy.selectTasksFromDeletedQueryGroup( - queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getActiveTasks() - ); + List tasks = queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getActiveTasks(); + List taskCancellations = new ArrayList<>(); - for (Task task : tasks) { + for (QueryGroupTask task : tasks) { String cancellationReason = "[Workload Management] Cancelling Task ID : " + task.getId() + " from QueryGroup ID : " + queryGroup.get_id(); - taskCancellations.add(createTaskCancellation((CancellableTask) task, cancellationReason)); + taskCancellations.add(createTaskCancellation(task, cancellationReason)); } return taskCancellations; } @@ -231,7 +230,7 @@ private long getReduceBy(QueryGroup queryGroup, ResourceType resourceType) { if (queryGroup.getResourceLimits().get(resourceType) == null) { return 0; } - Double threshold = (Double) queryGroup.getResourceLimits().get(resourceType); + Double threshold = queryGroup.getResourceLimits().get(resourceType); return getResourceUsage(queryGroup, resourceType) - convertThresholdIntoLong(resourceType, threshold); } diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java index 33b854ce5d760..032eca5a8a12c 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java +++ 
b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java @@ -8,9 +8,9 @@ package org.opensearch.wlm.cancellation; -import org.opensearch.search.ResourceType; +import org.opensearch.wlm.QueryGroupTask; +import org.opensearch.wlm.ResourceType; import org.opensearch.tasks.CancellableTask; -import org.opensearch.tasks.Task; import org.opensearch.tasks.TaskCancellation; import java.util.ArrayList; @@ -32,8 +32,8 @@ public class DefaultTaskSelectionStrategy { * * @return The comparator */ - public Comparator sortingCondition() { - return Comparator.comparingLong(Task::getStartTime); + public Comparator sortingCondition() { + return Comparator.comparingLong(QueryGroupTask::getStartTime); } /** @@ -46,7 +46,7 @@ public Comparator sortingCondition() { * @return The list of selected tasks * @throws IllegalArgumentException If the limit is less than zero */ - public List selectTasksForCancellation(List tasks, long limit, ResourceType resourceType) { + public List selectTasksForCancellation(List tasks, long limit, ResourceType resourceType) { if (limit < 0) { throw new IllegalArgumentException("limit has to be greater than zero"); } @@ -54,17 +54,15 @@ public List selectTasksForCancellation(List tasks, long limit, Resou return Collections.emptyList(); } - List sortedTasks = tasks.stream().sorted(sortingCondition()).collect(Collectors.toList()); + List sortedTasks = tasks.stream().sorted(sortingCondition()).collect(Collectors.toList()); - List selectedTasks = new ArrayList<>(); + List selectedTasks = new ArrayList<>(); long accumulated = 0; - for (Task task : sortedTasks) { - if (task instanceof CancellableTask) { - selectedTasks.add(task); - accumulated += resourceType.getResourceUsage(task); - if (accumulated >= limit) { - break; - } + for (QueryGroupTask task : sortedTasks) { + selectedTasks.add(task); + accumulated += resourceType.getResourceUsage(task); + if (accumulated >= limit) { + break; } } return selectedTasks; @@ -79,7 +77,7 @@ public 
List selectTasksForCancellation(List tasks, long limit, Resou * @param tasks The list of {@link Task} objects to be evaluated for cancellation. * @return A list of {@link TaskCancellation} objects representing the tasks selected for cancellation. */ - public List selectTasksFromDeletedQueryGroup(List tasks) { + public List selectTasksFromDeletedQueryGroup(List tasks) { return tasks.stream().filter(task -> task instanceof CancellableTask).collect(Collectors.toList()); } } diff --git a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java index 15852b5bbe6a8..d83764ad2c602 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java @@ -44,11 +44,11 @@ public QueryGroupResourceUsageTrackerService(TaskResourceTrackingService taskRes * @return Map of QueryGroup views */ public Map constructQueryGroupLevelUsageViews() { - final Map> tasksByQueryGroup = getTasksGroupedByQueryGroup(); + final Map> tasksByQueryGroup = getTasksGroupedByQueryGroup(); final Map queryGroupViews = new HashMap<>(); // Iterate over each QueryGroup entry - for (Map.Entry> queryGroupEntry : tasksByQueryGroup.entrySet()) { + for (Map.Entry> queryGroupEntry : tasksByQueryGroup.entrySet()) { // Compute the QueryGroup usage final EnumMap queryGroupUsage = new EnumMap<>(ResourceType.class); for (ResourceType resourceType : TRACKED_RESOURCES) { @@ -73,12 +73,12 @@ public Map constructQueryGroupLevelUsa * * @return Map of tasks grouped by QueryGroup */ - private Map> getTasksGroupedByQueryGroup() { + private Map> getTasksGroupedByQueryGroup() { return taskResourceTrackingService.getResourceAwareTasks() .values() .stream() .filter(QueryGroupTask.class::isInstance) .map(QueryGroupTask.class::cast) - 
.collect(Collectors.groupingBy(QueryGroupTask::getQueryGroupId, Collectors.mapping(task -> (Task) task, Collectors.toList()))); + .collect(Collectors.groupingBy(QueryGroupTask::getQueryGroupId, Collectors.mapping(task -> task, Collectors.toList()))); } } diff --git a/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java b/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java index 532bf3de95bd6..2b7de064bfcee 100644 --- a/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java +++ b/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java @@ -10,7 +10,6 @@ import org.opensearch.action.search.SearchAction; import org.opensearch.core.tasks.TaskId; -import org.opensearch.tasks.Task; import org.opensearch.test.OpenSearchTestCase; import java.util.Collections; @@ -19,7 +18,7 @@ public class QueryGroupLevelResourceUsageViewTests extends OpenSearchTestCase { Map resourceUsage; - List activeTasks; + List activeTasks; public void setUp() throws Exception { super.setUp(); @@ -41,7 +40,7 @@ public void testGetActiveTasks() { resourceUsage, activeTasks ); - List activeTasks = queryGroupLevelResourceUsageView.getActiveTasks(); + List activeTasks = queryGroupLevelResourceUsageView.getActiveTasks(); assertEquals(1, activeTasks.size()); assertEquals(4321, activeTasks.get(0).getId()); } @@ -50,8 +49,8 @@ private boolean assertResourceUsageData(Map resourceUsageDat return resourceUsageData.get(ResourceType.fromName("memory")) == 34L && resourceUsageData.get(ResourceType.fromName("cpu")) == 12L; } - private Task getRandomTask(long id) { - return new Task( + private QueryGroupTask getRandomTask(long id) { + return new QueryGroupTask( id, "transport", SearchAction.NAME, diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java index 
fc2b7e42406cb..7a59cb43ded5b 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java @@ -14,7 +14,8 @@ import org.opensearch.core.tasks.resourcetracker.ResourceStats; import org.opensearch.core.tasks.resourcetracker.ResourceStatsType; import org.opensearch.core.tasks.resourcetracker.ResourceUsageMetric; -import org.opensearch.search.ResourceType; +import org.opensearch.wlm.QueryGroupTask; +import org.opensearch.wlm.ResourceType; import org.opensearch.tasks.Task; import org.opensearch.test.OpenSearchTestCase; @@ -28,8 +29,8 @@ public void testSelectTasksFromDeletedQueryGroup() { DefaultTaskSelectionStrategy testDefaultTaskSelectionStrategy = new DefaultTaskSelectionStrategy(); long thresholdInLong = 100L; long reduceBy = Long.MIN_VALUE; - List tasks = getListOfTasks(thresholdInLong); - List selectedTasks = testDefaultTaskSelectionStrategy.selectTasksFromDeletedQueryGroup(tasks); + List tasks = getListOfTasks(thresholdInLong); + List selectedTasks = testDefaultTaskSelectionStrategy.selectTasksFromDeletedQueryGroup(tasks); assertFalse(selectedTasks.isEmpty()); assertTrue(tasksUsageMeetsThreshold(selectedTasks, reduceBy)); } @@ -39,8 +40,8 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsGrea long thresholdInLong = 100L; long reduceBy = 50L; ResourceType resourceType = ResourceType.MEMORY; - List tasks = getListOfTasks(thresholdInLong); - List selectedTasks = testDefaultTaskSelectionStrategy.selectTasksForCancellation(tasks, reduceBy, resourceType); + List tasks = getListOfTasks(thresholdInLong); + List selectedTasks = testDefaultTaskSelectionStrategy.selectTasksForCancellation(tasks, reduceBy, resourceType); assertFalse(selectedTasks.isEmpty()); assertTrue(tasksUsageMeetsThreshold(selectedTasks, reduceBy)); } @@ -50,7 +51,7 @@ public void 
testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsLess long thresholdInLong = 100L; long reduceBy = -50L; ResourceType resourceType = ResourceType.MEMORY; - List tasks = getListOfTasks(thresholdInLong); + List tasks = getListOfTasks(thresholdInLong); try { testDefaultTaskSelectionStrategy.selectTasksForCancellation(tasks, reduceBy, resourceType); } catch (Exception e) { @@ -64,7 +65,7 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsEqua long thresholdInLong = 100L; long reduceBy = 0; ResourceType resourceType = ResourceType.MEMORY; - List tasks = getListOfTasks(thresholdInLong); + List tasks = getListOfTasks(thresholdInLong); List selectedTasks = testDefaultTaskSelectionStrategy.selectTasksForCancellation(tasks, reduceBy, resourceType); assertTrue(selectedTasks.isEmpty()); } @@ -80,12 +81,12 @@ private boolean tasksUsageMeetsThreshold(List selectedTasks, long threshol return false; } - private List getListOfTasks(long totalMemory) { - List tasks = new ArrayList<>(); + private List getListOfTasks(long totalMemory) { + List tasks = new ArrayList<>(); while (totalMemory > 0) { long id = randomLong(); - final Task task = getRandomSearchTask(id); + final QueryGroupTask task = getRandomSearchTask(id); long initial_memory = randomLongBetween(1, 100); ResourceUsageMetric[] initialTaskResourceMetrics = new ResourceUsageMetric[] { @@ -106,7 +107,7 @@ private List getListOfTasks(long totalMemory) { return tasks; } - private Task getRandomSearchTask(long id) { + private QueryGroupTask getRandomSearchTask(long id) { return new SearchTask( id, "transport", From 498743adb3758f649e01ab9f879546ccdb6321df Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Tue, 3 Sep 2024 15:47:17 -0700 Subject: [PATCH 14/47] code clean up and refactorings Signed-off-by: Kaushal Kumar --- .../wlm/QueryGroupLevelResourceUsageView.java | 8 +- .../java/org/opensearch/wlm/ResourceType.java | 18 +-- .../cancellation/DefaultTaskCancellation.java | 65 ++------- 
.../DefaultTaskSelectionStrategy.java | 39 +++--- .../wlm/tracker/QueryGroupResourceUsage.java | 131 ++++++++++++++++++ ...QueryGroupResourceUsageTrackerService.java | 29 ++-- .../tracker/TaskResourceUsageCalculator.java | 60 ++++++++ ...QueryGroupLevelResourceUsageViewTests.java | 48 +++---- .../org/opensearch/wlm/ResourceTypeTests.java | 13 -- .../DefaultTaskCancellationTests.java | 107 ++++++++------ .../DefaultTaskSelectionStrategyTests.java | 36 +++-- ...GroupResourceUsageTrackerServiceTests.java | 36 ++++- 12 files changed, 386 insertions(+), 204 deletions(-) create mode 100644 server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsage.java create mode 100644 server/src/main/java/org/opensearch/wlm/tracker/TaskResourceUsageCalculator.java diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupLevelResourceUsageView.java b/server/src/main/java/org/opensearch/wlm/QueryGroupLevelResourceUsageView.java index 6cb73951ac6c0..bb69acc66cb90 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupLevelResourceUsageView.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupLevelResourceUsageView.java @@ -9,6 +9,8 @@ package org.opensearch.wlm; +import org.opensearch.wlm.tracker.QueryGroupResourceUsage; + import java.util.List; import java.util.Map; @@ -19,11 +21,11 @@ */ public class QueryGroupLevelResourceUsageView { // resourceUsage holds the resource usage data for a QueryGroup at a point in time - private final Map resourceUsage; + private final Map resourceUsage; // activeTasks holds the list of active tasks for a QueryGroup at a point in time private final List activeTasks; - public QueryGroupLevelResourceUsageView(Map resourceUsage, List activeTasks) { + public QueryGroupLevelResourceUsageView(Map resourceUsage, List activeTasks) { this.resourceUsage = resourceUsage; this.activeTasks = activeTasks; } @@ -33,7 +35,7 @@ public QueryGroupLevelResourceUsageView(Map resourceUsage, L * * @return The map of resource usage data */ - 
public Map getResourceUsageData() { + public Map getResourceUsageData() { return resourceUsage; } diff --git a/server/src/main/java/org/opensearch/wlm/ResourceType.java b/server/src/main/java/org/opensearch/wlm/ResourceType.java index c3f48f5f793ce..961e06849e0fc 100644 --- a/server/src/main/java/org/opensearch/wlm/ResourceType.java +++ b/server/src/main/java/org/opensearch/wlm/ResourceType.java @@ -23,16 +23,14 @@ */ @PublicApi(since = "2.17.0") public enum ResourceType { - CPU("cpu", task -> task.getTotalResourceUtilization(ResourceStats.CPU), true), - MEMORY("memory", task -> task.getTotalResourceUtilization(ResourceStats.MEMORY), true); + CPU("cpu", true), + MEMORY("memory", true); private final String name; - private final Function getResourceUsage; private final boolean statsEnabled; - ResourceType(String name, Function getResourceUsage, boolean statsEnabled) { + ResourceType(String name, boolean statsEnabled) { this.name = name; - this.getResourceUsage = getResourceUsage; this.statsEnabled = statsEnabled; } @@ -58,16 +56,6 @@ public String getName() { return name; } - /** - * Gets the resource usage for a given resource type and task. 
- * - * @param task the task for which to calculate resource usage - * @return the resource usage - */ - public long getResourceUsage(Task task) { - return getResourceUsage.apply(task); - } - public boolean hasStatsEnabled() { return statsEnabled; } diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java index 14b454c788b83..1995874e0ed71 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java @@ -9,14 +9,13 @@ package org.opensearch.wlm.cancellation; import org.opensearch.cluster.metadata.QueryGroup; -import org.opensearch.monitor.jvm.JvmStats; -import org.opensearch.monitor.process.ProcessProbe; import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; import org.opensearch.tasks.CancellableTask; import org.opensearch.tasks.TaskCancellation; import org.opensearch.wlm.QueryGroupLevelResourceUsageView; import org.opensearch.wlm.WorkloadManagementSettings; +import org.opensearch.wlm.tracker.QueryGroupResourceUsage; import java.util.ArrayList; import java.util.Collection; @@ -46,7 +45,7 @@ * @see ResourceType */ public class DefaultTaskCancellation { - private static final long HEAP_SIZE_BYTES = JvmStats.jvmStats().getMem().getHeapMax().getBytes(); + public static final double MIN_VALUE = 1e-9; protected final WorkloadManagementSettings workloadManagementSettings; protected final DefaultTaskSelectionStrategy defaultTaskSelectionStrategy; @@ -134,18 +133,17 @@ private List getQueryGroupsToCancelFrom(QueryGroup.ResiliencyMode re if (queryGroup.getResiliencyMode() != resiliencyMode) { continue; } - Map queryGroupResourceUsage = queryGroupLevelResourceUsageViews.get(queryGroup.get_id()) + Map queryGroupResourcesUsage = queryGroupLevelResourceUsageViews.get(queryGroup.get_id()) .getResourceUsageData(); 
for (ResourceType resourceType : TRACKED_RESOURCES) { - if (queryGroup.getResourceLimits().containsKey(resourceType) && queryGroupResourceUsage.containsKey(resourceType)) { - Double resourceLimit = (Double) queryGroup.getResourceLimits().get(resourceType); - Long resourceUsage = queryGroupResourceUsage.get(resourceType); - - if (isBreachingThreshold(resourceType, resourceLimit, resourceUsage)) { + if (queryGroup.getResourceLimits().containsKey(resourceType)) { + final QueryGroupResourceUsage queryGroupResourceUsage = queryGroupResourcesUsage.get(resourceType); + if (queryGroupResourceUsage.isBreachingThresholdFor(queryGroup, workloadManagementSettings)) { queryGroupsToCancelFrom.add(queryGroup); break; } + } } } @@ -175,7 +173,7 @@ protected List getCancellableTasksFrom(QueryGroup queryGroup) } private boolean shouldCancelTasks(QueryGroup queryGroup, ResourceType resourceType) { - return getReduceBy(queryGroup, resourceType) > 0; + return getReduceBy(queryGroup, resourceType) > MIN_VALUE; } private List getTaskCancellations(QueryGroup queryGroup, ResourceType resourceType) { @@ -205,7 +203,7 @@ private String createCancellationReason(QueryGroup querygroup, QueryGroupTask ta } private Double getThresholdInPercent(QueryGroup querygroup, ResourceType resourceType) { - return ((Double) (querygroup.getResourceLimits().get(resourceType))) * 100; + return querygroup.getResourceLimits().get(resourceType) * 100; } private TaskCancellation createTaskCancellation(CancellableTask task, String cancellationReason) { @@ -226,48 +224,13 @@ protected List getTaskCancellationsForDeletedQueryGroup(QueryG return taskCancellations; } - private long getReduceBy(QueryGroup queryGroup, ResourceType resourceType) { - if (queryGroup.getResourceLimits().get(resourceType) == null) { + private double getReduceBy(QueryGroup queryGroup, ResourceType resourceType) { + if (queryGroup.getResourceLimits().get(resourceType) == null || 
!queryGroupLevelResourceUsageViews.containsKey(queryGroup.get_id())) { return 0; } - Double threshold = queryGroup.getResourceLimits().get(resourceType); - return getResourceUsage(queryGroup, resourceType) - convertThresholdIntoLong(resourceType, threshold); - } - - private Long convertThresholdIntoLong(ResourceType resourceType, Double resourceThresholdInPercentage) { - Long threshold = null; - if (resourceType == ResourceType.MEMORY) { - // Check if resource usage is breaching the threshold - double nodeLevelCancellationThreshold = this.workloadManagementSettings.getNodeLevelMemoryCancellationThreshold() - * HEAP_SIZE_BYTES; - threshold = (long) (resourceThresholdInPercentage * nodeLevelCancellationThreshold); - } else if (resourceType == ResourceType.CPU) { - // Get the total CPU time of the process in milliseconds - long cpuTotalTimeInMillis = ProcessProbe.getInstance().getProcessCpuTotalTime(); - double nodeLevelCancellationThreshold = this.workloadManagementSettings.getNodeLevelCpuCancellationThreshold() - * cpuTotalTimeInMillis; - // Check if resource usage is breaching the threshold - threshold = (long) (resourceThresholdInPercentage * nodeLevelCancellationThreshold); - } - return threshold; - } - - private Long getResourceUsage(QueryGroup queryGroup, ResourceType resourceType) { - if (!queryGroupLevelResourceUsageViews.containsKey(queryGroup.get_id())) { - return 0L; - } - return queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getResourceUsageData().get(resourceType); - } - - private boolean isBreachingThreshold(ResourceType resourceType, Double resourceThresholdInPercentage, long resourceUsage) { - if (resourceType == ResourceType.MEMORY) { - // Check if resource usage is breaching the threshold - return resourceUsage > convertThresholdIntoLong(resourceType, resourceThresholdInPercentage); - } - // Resource types should be CPU, resourceUsage is in nanoseconds, convert to milliseconds - long resourceUsageInMillis = resourceUsage / 1_000_000; - 
// Check if resource usage is breaching the threshold - return resourceUsageInMillis > convertThresholdIntoLong(resourceType, resourceThresholdInPercentage); + final QueryGroupLevelResourceUsageView queryGroupLevelResourceUsage = queryGroupLevelResourceUsageViews.get(queryGroup.get_id()); + final QueryGroupResourceUsage queryGroupResourceUsage = queryGroupLevelResourceUsage.getResourceUsageData().get(resourceType); + return queryGroupResourceUsage.getReduceByFor(queryGroup, workloadManagementSettings); } private void callbackOnCancel() { diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java index 032eca5a8a12c..12798fba6e297 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java @@ -10,15 +10,17 @@ import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; -import org.opensearch.tasks.CancellableTask; -import org.opensearch.tasks.TaskCancellation; +import org.opensearch.wlm.tracker.TaskResourceUsageCalculator; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.List; +import java.util.function.Supplier; import java.util.stream.Collectors; +import static org.opensearch.wlm.cancellation.DefaultTaskCancellation.MIN_VALUE; + /** * Represents an abstract task selection strategy. * This class implements the DefaultTaskSelectionStrategy interface and provides a method to select tasks for cancellation based on a sorting condition. 
@@ -26,6 +28,16 @@ */ public class DefaultTaskSelectionStrategy { + private final Supplier nanoTimeSupplier; + + public DefaultTaskSelectionStrategy() { + this(System::nanoTime); + } + + public DefaultTaskSelectionStrategy(Supplier nanoTimeSupplier) { + this.nanoTimeSupplier = nanoTimeSupplier; + } + /** * Returns a comparator that defines the sorting condition for tasks. * This is the default implementation since the longest running tasks are the ones that consume the most resources. @@ -46,38 +58,25 @@ public Comparator sortingCondition() { * @return The list of selected tasks * @throws IllegalArgumentException If the limit is less than zero */ - public List selectTasksForCancellation(List tasks, long limit, ResourceType resourceType) { + public List selectTasksForCancellation(List tasks, double limit, ResourceType resourceType) { if (limit < 0) { throw new IllegalArgumentException("limit has to be greater than zero"); } - if (limit == 0) { + if (limit < MIN_VALUE) { return Collections.emptyList(); } List sortedTasks = tasks.stream().sorted(sortingCondition()).collect(Collectors.toList()); List selectedTasks = new ArrayList<>(); - long accumulated = 0; + double accumulated = 0; for (QueryGroupTask task : sortedTasks) { selectedTasks.add(task); - accumulated += resourceType.getResourceUsage(task); - if (accumulated >= limit) { + accumulated += TaskResourceUsageCalculator.from(resourceType).calculateFor(task, nanoTimeSupplier); + if ((accumulated - limit) > MIN_VALUE) { break; } } return selectedTasks; } - - /** - * Selects tasks for cancellation from deleted query group. - * This method iterates over the provided list of tasks and selects those that are instances of - * {@link CancellableTask}. For each selected task, it creates a cancellation reason and adds - * a {@link TaskCancellation} object to the list of selected tasks. - * - * @param tasks The list of {@link Task} objects to be evaluated for cancellation. 
- * @return A list of {@link TaskCancellation} objects representing the tasks selected for cancellation. - */ - public List selectTasksFromDeletedQueryGroup(List tasks) { - return tasks.stream().filter(task -> task instanceof CancellableTask).collect(Collectors.toList()); - } } diff --git a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsage.java b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsage.java new file mode 100644 index 0000000000000..f848f78e39c1c --- /dev/null +++ b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsage.java @@ -0,0 +1,131 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.wlm.tracker; + +import org.opensearch.cluster.metadata.QueryGroup; +import org.opensearch.wlm.QueryGroupTask; +import org.opensearch.wlm.ResourceType; +import org.opensearch.wlm.WorkloadManagementSettings; + +import java.util.List; +import java.util.function.Supplier; + +import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.PROCESSOR_COUNT; + +/** + * This class is used to track query group level resource usage + */ +public abstract class QueryGroupResourceUsage { + private double currentUsage; + + + /** + * getter for value field + * @return resource usage value + */ + public double getCurrentUsage() { + return currentUsage; + } + + public void setCurrentUsage(double currentUsage) { + this.currentUsage = currentUsage; + } + + public static QueryGroupResourceUsage from(ResourceType resourceType) { + if (resourceType == ResourceType.CPU) { + return new QueryGroupCpuUsage(); + } else if (resourceType == ResourceType.MEMORY) { + return new QueryGroupMemoryUsage(); + } + throw new IllegalArgumentException("Invalid resource type: " + resourceType.getName() + ". 
It is currently not supported in wlm"); + } + + + /** + * Determines whether {@link QueryGroup} is breaching its threshold for the resource + * @param queryGroup + * @return whether the query group is breaching threshold for this resource + */ + public boolean isBreachingThresholdFor(QueryGroup queryGroup, WorkloadManagementSettings settings) { + return getCurrentUsage() > getNormalisedThresholdFor(queryGroup, settings); + } + + /** + * returns the value by which the resource usage is beyond the configured limit for the query group + * @param queryGroup instance + * @param settings {@link WorkloadManagementSettings} instance + * @return the overshooting limit for the resource + */ + public double getReduceByFor(QueryGroup queryGroup, WorkloadManagementSettings settings) { + return getCurrentUsage() - getNormalisedThresholdFor(queryGroup, settings); + } + + /** + * initialises the member variable currentUsage + * @param tasks list of tasks in the query group + * @param timeSupplier nano time supplier + */ + public void initialise(List tasks, Supplier timeSupplier) { + this.setCurrentUsage(this.calculateResourceUsage(tasks, timeSupplier)); + } + + /** + * normalises configured value with respect to node level cancellation thresholds + * @param queryGroup instance + * @param settings {@link WorkloadManagementSettings} instance + * @return normalised value with respect to node level cancellation thresholds + */ + public abstract double getNormalisedThresholdFor(QueryGroup queryGroup, WorkloadManagementSettings settings); + + /** + * calculates the current resource usage for the query group + * @param tasks list of tasks in the query group + * @param timeSupplier nano time supplier + */ + public abstract double calculateResourceUsage(List tasks, Supplier timeSupplier); + + /** + * class to store cpu usage for the query group + */ + public static class QueryGroupCpuUsage extends QueryGroupResourceUsage { + @Override + public double getNormalisedThresholdFor(QueryGroup 
queryGroup, WorkloadManagementSettings settings) { + return settings.getNodeLevelCpuCancellationThreshold() * queryGroup.getResourceLimits().get(ResourceType.CPU); + } + + @Override + public double calculateResourceUsage(List tasks, Supplier timeSupplier) { + double usage = tasks.stream().mapToDouble(task -> { + return TaskResourceUsageCalculator.from(ResourceType.CPU).calculateFor(task, timeSupplier); + }).sum(); + + usage /= PROCESSOR_COUNT; + return usage; + } + } + + /** + * class to store memory usage for the query group + */ + public static class QueryGroupMemoryUsage extends QueryGroupResourceUsage { + @Override + public double getNormalisedThresholdFor(QueryGroup queryGroup, WorkloadManagementSettings settings) { + return settings.getNodeLevelMemoryCancellationThreshold() * queryGroup.getResourceLimits().get(ResourceType.MEMORY); + } + + @Override + public double calculateResourceUsage(List tasks, Supplier timeSupplier) { + double usage = tasks.stream().mapToDouble(task -> { + return TaskResourceUsageCalculator.from(ResourceType.MEMORY).calculateFor(task, timeSupplier); + }).sum(); + return usage; + } + } + +} diff --git a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java index d83764ad2c602..a56faf8c0978d 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java @@ -8,34 +8,41 @@ package org.opensearch.wlm.tracker; +import org.opensearch.core.tasks.resourcetracker.ResourceStats; +import org.opensearch.monitor.jvm.JvmStats; import org.opensearch.tasks.Task; import org.opensearch.tasks.TaskResourceTrackingService; import org.opensearch.wlm.QueryGroupLevelResourceUsageView; import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; -import java.util.EnumMap; import 
java.util.EnumSet; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.function.Supplier; import java.util.stream.Collectors; /** * This class tracks resource usage per QueryGroup */ public class QueryGroupResourceUsageTrackerService { - + public static final long HEAP_SIZE_BYTES = JvmStats.jvmStats().getMem().getHeapMax().getBytes(); + // This value should be initialised at the start time of the process and be used throughout the codebase + public static final int PROCESSOR_COUNT = Runtime.getRuntime().availableProcessors(); public static final EnumSet TRACKED_RESOURCES = EnumSet.allOf(ResourceType.class); private final TaskResourceTrackingService taskResourceTrackingService; + private final Supplier nanoTimeSupplier; /** * QueryGroupResourceTrackerService constructor * * @param taskResourceTrackingService Service that helps track resource usage of tasks running on a node. */ - public QueryGroupResourceUsageTrackerService(TaskResourceTrackingService taskResourceTrackingService) { + public QueryGroupResourceUsageTrackerService(TaskResourceTrackingService taskResourceTrackingService, + Supplier nanoTimeSupplier) { this.taskResourceTrackingService = taskResourceTrackingService; + this.nanoTimeSupplier = nanoTimeSupplier; } /** @@ -49,20 +56,18 @@ public Map constructQueryGroupLevelUsa // Iterate over each QueryGroup entry for (Map.Entry> queryGroupEntry : tasksByQueryGroup.entrySet()) { - // Compute the QueryGroup usage - final EnumMap queryGroupUsage = new EnumMap<>(ResourceType.class); - for (ResourceType resourceType : TRACKED_RESOURCES) { - long queryGroupResourceUsage = 0; - for (Task task : queryGroupEntry.getValue()) { - queryGroupResourceUsage += resourceType.getResourceUsage(task); - } - queryGroupUsage.put(resourceType, queryGroupResourceUsage); + // Compute the QueryGroup resource usage + final Map resourceUsage = new HashMap<>(); + for (ResourceType resourceType: TRACKED_RESOURCES) { + final QueryGroupResourceUsage 
queryGroupResourceUsage = QueryGroupResourceUsage.from(resourceType); + queryGroupResourceUsage.initialise(queryGroupEntry.getValue(), nanoTimeSupplier); + resourceUsage.put(resourceType, queryGroupResourceUsage); } // Add to the QueryGroup View queryGroupViews.put( queryGroupEntry.getKey(), - new QueryGroupLevelResourceUsageView(queryGroupUsage, queryGroupEntry.getValue()) + new QueryGroupLevelResourceUsageView(resourceUsage, queryGroupEntry.getValue()) ); } return queryGroupViews; diff --git a/server/src/main/java/org/opensearch/wlm/tracker/TaskResourceUsageCalculator.java b/server/src/main/java/org/opensearch/wlm/tracker/TaskResourceUsageCalculator.java new file mode 100644 index 0000000000000..f2c6f6cd82a37 --- /dev/null +++ b/server/src/main/java/org/opensearch/wlm/tracker/TaskResourceUsageCalculator.java @@ -0,0 +1,60 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.wlm.tracker; + +import org.opensearch.core.tasks.resourcetracker.ResourceStats; +import org.opensearch.tasks.Task; +import org.opensearch.wlm.ResourceType; + +import java.util.function.Supplier; + +import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.HEAP_SIZE_BYTES; + +/** + * Utility class to calculate task level resource usage + */ +public abstract class TaskResourceUsageCalculator { + public static TaskResourceUsageCalculator from(final ResourceType resourceType) { + if (resourceType == ResourceType.CPU) { + return new TaskCpuUsageCalculator(); + } else if (resourceType == ResourceType.MEMORY) { + return new TaskMemoryUsageCalculator(); + } + throw new IllegalArgumentException("Invalid resource type " + resourceType + " . 
It is not supported in wlm"); + } + + /** + * calculates the resource usage for the task + * @param task {@link Task} instance + * @param nanoTimeSupplier time supplier in nano second unit + * @return task resource usage + */ + public abstract double calculateFor(Task task, Supplier nanoTimeSupplier); + + /** + * This class will return per core cpu usage for a task + */ + public static class TaskCpuUsageCalculator extends TaskResourceUsageCalculator { + @Override + public double calculateFor(Task task, Supplier nanoTimeSupplier) { + return ((1.0f * task.getTotalResourceUtilization(ResourceStats.CPU)) / (nanoTimeSupplier.get() - task.getStartTimeNanos())); + } + } + + /** + * This class will return allocated bytes by the task since task has been created + */ + public static class TaskMemoryUsageCalculator extends TaskResourceUsageCalculator { + @Override + public double calculateFor(Task task, Supplier nanoTimeSupplier) { + return (1.0f * task.getTotalResourceUtilization(ResourceStats.MEMORY)) / HEAP_SIZE_BYTES; + } + } +} + diff --git a/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java b/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java index 2b7de064bfcee..4dfda5033bdb3 100644 --- a/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java +++ b/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java @@ -20,30 +20,30 @@ public class QueryGroupLevelResourceUsageViewTests extends OpenSearchTestCase { Map resourceUsage; List activeTasks; - public void setUp() throws Exception { - super.setUp(); - resourceUsage = Map.of(ResourceType.fromName("memory"), 34L, ResourceType.fromName("cpu"), 12L); - activeTasks = List.of(getRandomTask(4321)); - } - - public void testGetResourceUsageData() { - QueryGroupLevelResourceUsageView queryGroupLevelResourceUsageView = new QueryGroupLevelResourceUsageView( - resourceUsage, - activeTasks - ); - Map resourceUsageData = 
queryGroupLevelResourceUsageView.getResourceUsageData(); - assertTrue(assertResourceUsageData(resourceUsageData)); - } - - public void testGetActiveTasks() { - QueryGroupLevelResourceUsageView queryGroupLevelResourceUsageView = new QueryGroupLevelResourceUsageView( - resourceUsage, - activeTasks - ); - List activeTasks = queryGroupLevelResourceUsageView.getActiveTasks(); - assertEquals(1, activeTasks.size()); - assertEquals(4321, activeTasks.get(0).getId()); - } +// public void setUp() throws Exception { +// super.setUp(); +// resourceUsage = Map.of(ResourceType.fromName("memory"), 34L, ResourceType.fromName("cpu"), 12L); +// activeTasks = List.of(getRandomTask(4321)); +// } +// +// public void testGetResourceUsageData() { +// QueryGroupLevelResourceUsageView queryGroupLevelResourceUsageView = new QueryGroupLevelResourceUsageView( +// resourceUsage, +// activeTasks +// ); +// Map resourceUsageData = queryGroupLevelResourceUsageView.getResourceUsageData(); +// assertTrue(assertResourceUsageData(resourceUsageData)); +// } + +// public void testGetActiveTasks() { +// QueryGroupLevelResourceUsageView queryGroupLevelResourceUsageView = new QueryGroupLevelResourceUsageView( +// resourceUsage, +// activeTasks +// ); +// List activeTasks = queryGroupLevelResourceUsageView.getActiveTasks(); +// assertEquals(1, activeTasks.size()); +// assertEquals(4321, activeTasks.get(0).getId()); +// } private boolean assertResourceUsageData(Map resourceUsageData) { return resourceUsageData.get(ResourceType.fromName("memory")) == 34L && resourceUsageData.get(ResourceType.fromName("cpu")) == 12L; diff --git a/server/src/test/java/org/opensearch/wlm/ResourceTypeTests.java b/server/src/test/java/org/opensearch/wlm/ResourceTypeTests.java index 737cbb37b554c..2bbf7b529b14b 100644 --- a/server/src/test/java/org/opensearch/wlm/ResourceTypeTests.java +++ b/server/src/test/java/org/opensearch/wlm/ResourceTypeTests.java @@ -35,17 +35,4 @@ public void testGetName() { assertEquals("cpu", 
ResourceType.CPU.getName()); assertEquals("memory", ResourceType.MEMORY.getName()); } - - public void testGetResourceUsage() { - SearchShardTask mockTask = createMockTask(SearchShardTask.class, 100, 200); - assertEquals(100, ResourceType.CPU.getResourceUsage(mockTask)); - assertEquals(200, ResourceType.MEMORY.getResourceUsage(mockTask)); - } - - private T createMockTask(Class type, long cpuUsage, long heapUsage) { - T task = mock(type); - when(task.getTotalResourceUtilization(ResourceStats.CPU)).thenReturn(cpuUsage); - when(task.getTotalResourceUtilization(ResourceStats.MEMORY)).thenReturn(heapUsage); - return task; - } } diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java index 5c77c5c7f7a55..9817f69b3be74 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java @@ -12,13 +12,17 @@ import org.opensearch.action.search.SearchTask; import org.opensearch.cluster.metadata.QueryGroup; import org.opensearch.core.tasks.TaskId; -import org.opensearch.search.ResourceType; -import org.opensearch.tasks.Task; +import org.opensearch.wlm.QueryGroupTask; +import org.opensearch.wlm.ResourceType; import org.opensearch.tasks.TaskCancellation; import org.opensearch.test.OpenSearchTestCase; import org.opensearch.wlm.QueryGroupLevelResourceUsageView; import org.opensearch.wlm.WorkloadManagementSettings; import org.junit.Before; +import org.opensearch.wlm.tracker.QueryGroupResourceUsage.QueryGroupMemoryUsage; +import org.opensearch.wlm.tracker.QueryGroupResourceUsage; +import org.opensearch.wlm.tracker.QueryGroupResourceUsage.QueryGroupCpuUsage; +import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerServiceTests.TestClock; import java.util.Collection; import java.util.Collections; @@ -30,6 +34,7 @@ import 
java.util.function.BooleanSupplier; import java.util.stream.Collectors; +import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -37,6 +42,8 @@ public class DefaultTaskCancellationTests extends OpenSearchTestCase { private static final String queryGroupId1 = "queryGroup1"; private static final String queryGroupId2 = "queryGroup2"; + private TestClock clock; + private static class TestTaskCancellationImpl extends DefaultTaskCancellation { public TestTaskCancellationImpl( @@ -70,6 +77,7 @@ public void setup() { queryGroupLevelViews = new HashMap<>(); activeQueryGroups = new HashSet<>(); deletedQueryGroups = new HashSet<>(); + clock = new TestClock(); taskCancellation = new TestTaskCancellationImpl( workloadManagementSettings, new DefaultTaskSelectionStrategy(), @@ -82,7 +90,7 @@ public void setup() { public void testGetCancellableTasksFrom_setupAppropriateCancellationReasonAndScore() { ResourceType resourceType = ResourceType.CPU; - long usage = 100_000_000L; + QueryGroupCpuUsage usage = mock(QueryGroupCpuUsage.class); Double threshold = 0.1; QueryGroup queryGroup1 = new QueryGroup( @@ -92,7 +100,8 @@ public void testGetCancellableTasksFrom_setupAppropriateCancellationReasonAndSco Map.of(resourceType, threshold), 1L ); - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage); + clock.fastForwardBy(1000); + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage, 0.3); queryGroupLevelViews.put(queryGroupId1, mockView); List cancellableTasksFrom = taskCancellation.getCancellableTasksFrom(queryGroup1); @@ -111,7 +120,7 @@ public void testGetCancellableTasksFrom_setupAppropriateCancellationReasonAndSco public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThreshold() { ResourceType resourceType = ResourceType.CPU; - long usage = 100_000_000L; + QueryGroupCpuUsage usage = mock(QueryGroupCpuUsage.class); Double 
threshold = 0.1; QueryGroup queryGroup1 = new QueryGroup( @@ -121,7 +130,7 @@ public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThreshold() { Map.of(resourceType, threshold), 1L ); - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage); + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage, 0.15); queryGroupLevelViews.put(queryGroupId1, mockView); List cancellableTasksFrom = taskCancellation.getCancellableTasksFrom(queryGroup1); @@ -132,7 +141,7 @@ public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThreshold() { public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThresholdForMemory() { ResourceType resourceType = ResourceType.MEMORY; - long usage = 900_000_000_000L; + QueryGroupMemoryUsage usage = mock(QueryGroupMemoryUsage.class); Double threshold = 0.1; QueryGroup queryGroup1 = new QueryGroup( @@ -142,8 +151,11 @@ public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThresholdForMem Map.of(resourceType, threshold), 1L ); + when(usage.getCurrentUsage()).thenReturn(0.15); + when(usage.isBreachingThresholdFor(any(), any())).thenReturn(true); + + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage, 0.05); - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage); queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); @@ -155,7 +167,7 @@ public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThresholdForMem public void testGetCancellableTasksFrom_returnsNoTasksWhenNotBreachingThreshold() { ResourceType resourceType = ResourceType.CPU; - long usage = 500L; + QueryGroupMemoryUsage usage = mock(QueryGroupMemoryUsage.class); Double threshold = 0.9; QueryGroup queryGroup1 = new QueryGroup( "testQueryGroup", @@ -164,8 +176,8 @@ public void testGetCancellableTasksFrom_returnsNoTasksWhenNotBreachingThreshold( 
Map.of(resourceType, threshold), 1L ); - - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage); + when(usage.isBreachingThresholdFor(any(), any())).thenReturn(true); + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage, 0.0); queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); when(workloadManagementSettings.getNodeLevelCpuCancellationThreshold()).thenReturn(0.90); @@ -176,7 +188,7 @@ public void testGetCancellableTasksFrom_returnsNoTasksWhenNotBreachingThreshold( public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { ResourceType resourceType = ResourceType.CPU; - long usage = 150_000_000L; + QueryGroupCpuUsage usage = mock(QueryGroupCpuUsage.class); Double threshold = 0.01; QueryGroup queryGroup1 = new QueryGroup( @@ -187,7 +199,7 @@ public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { 1L ); - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage); + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage, 0.1); queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); @@ -206,7 +218,7 @@ public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { public void testCancelTasks_cancelsGivenTasks() { ResourceType resourceType = ResourceType.CPU; - long usage = 150_000_000_000L; + QueryGroupCpuUsage usage = mock(QueryGroupCpuUsage.class); Double threshold = 0.01; QueryGroup queryGroup1 = new QueryGroup( @@ -217,7 +229,8 @@ public void testCancelTasks_cancelsGivenTasks() { 1L ); - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage); + when(usage.isBreachingThresholdFor(any(), any())).thenReturn(true); + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage, 0.005); queryGroupLevelViews.put(queryGroupId1, mockView); 
activeQueryGroups.add(queryGroup1); @@ -242,7 +255,8 @@ public void testCancelTasks_cancelsGivenTasks() { public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { ResourceType resourceType = ResourceType.CPU; - long usage = 150_000_000_000L; + QueryGroupCpuUsage activeQueryGroupUsage = mock(QueryGroupCpuUsage.class); + QueryGroupCpuUsage deletedQueryGroupUsage = mock(QueryGroupCpuUsage.class); Double threshold = 0.01; QueryGroup activeQueryGroup = new QueryGroup( @@ -261,8 +275,11 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { 1L ); - QueryGroupLevelResourceUsageView mockView1 = createResourceUsageViewMock(resourceType, usage); - QueryGroupLevelResourceUsageView mockView2 = createResourceUsageViewMock(resourceType, usage, List.of(1000, 1001)); + when(activeQueryGroupUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); + when(deletedQueryGroupUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); + when(deletedQueryGroupUsage.getReduceByFor(any(), any())).thenReturn(0.001); + QueryGroupLevelResourceUsageView mockView1 = createResourceUsageViewMock(resourceType, activeQueryGroupUsage, 0.005); + QueryGroupLevelResourceUsageView mockView2 = createResourceUsageViewMock(resourceType, deletedQueryGroupUsage, List.of(1000, 1001)); queryGroupLevelViews.put(queryGroupId1, mockView1); queryGroupLevelViews.put(queryGroupId2, mockView2); activeQueryGroups.add(activeQueryGroup); @@ -299,7 +316,8 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeNotInDuress() { ResourceType resourceType = ResourceType.CPU; - long usage = 150_000_000_000L; + QueryGroupCpuUsage activeQueryGroupUsage = mock(QueryGroupCpuUsage.class); + QueryGroupCpuUsage deletedQueryGroupUsage = mock(QueryGroupCpuUsage.class); Double threshold = 0.01; QueryGroup activeQueryGroup = new QueryGroup( @@ -318,8 +336,12 @@ public void 
testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN 1L ); - QueryGroupLevelResourceUsageView mockView1 = createResourceUsageViewMock(resourceType, usage); - QueryGroupLevelResourceUsageView mockView2 = createResourceUsageViewMock(resourceType, usage, List.of(1000, 1001)); + when(activeQueryGroupUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); + when(deletedQueryGroupUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); + when(deletedQueryGroupUsage.getReduceByFor(any(), any())).thenReturn(0.001); + + QueryGroupLevelResourceUsageView mockView1 = createResourceUsageViewMock(resourceType, activeQueryGroupUsage, 0.001); + QueryGroupLevelResourceUsageView mockView2 = createResourceUsageViewMock(resourceType, deletedQueryGroupUsage, List.of(1000, 1001)); queryGroupLevelViews.put(queryGroupId1, mockView1); queryGroupLevelViews.put(queryGroupId2, mockView2); activeQueryGroups.add(activeQueryGroup); @@ -356,7 +378,8 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { ResourceType resourceType = ResourceType.CPU; - long usage = 150_000_000_000L; + QueryGroupCpuUsage usage1 = mock(QueryGroupCpuUsage.class); + QueryGroupCpuUsage usage2 = mock(QueryGroupCpuUsage.class); Double threshold = 0.01; QueryGroup queryGroup1 = new QueryGroup( @@ -375,8 +398,10 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { 1L ); - queryGroupLevelViews.put(queryGroupId1, createResourceUsageViewMock(resourceType, usage)); - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage); + when(usage1.isBreachingThresholdFor(any(), any())).thenReturn(true); + when(usage2.isBreachingThresholdFor(any(), any())).thenReturn(true); + queryGroupLevelViews.put(queryGroupId1, createResourceUsageViewMock(resourceType, usage1, 0.0001)); + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, 
usage2, 0.0001); when(mockView.getActiveTasks()).thenReturn(List.of(getRandomSearchTask(5678), getRandomSearchTask(8765))); queryGroupLevelViews.put(queryGroupId2, mockView); Collections.addAll(activeQueryGroups, queryGroup1, queryGroup2); @@ -407,9 +432,9 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { assertTrue(cancellableTasksFrom1.get(1).getTask().isCancelled()); } - public void testGetAllCancellableTasks_ReturnsNoTasksFromWhenNotBreachingThresholds() { + public void testGetAllCancellableTasks_ReturnsNoTasksWhenNotBreachingThresholds() { ResourceType resourceType = ResourceType.CPU; - long usage = 1L; + QueryGroupCpuUsage queryGroupCpuUsage = mock(QueryGroupCpuUsage.class); Double threshold = 0.1; QueryGroup queryGroup1 = new QueryGroup( @@ -419,8 +444,8 @@ public void testGetAllCancellableTasks_ReturnsNoTasksFromWhenNotBreachingThresho Map.of(resourceType, threshold), 1L ); - - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage); + when(queryGroupCpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, queryGroupCpuUsage, 0.001); queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); @@ -428,9 +453,9 @@ public void testGetAllCancellableTasks_ReturnsNoTasksFromWhenNotBreachingThresho assertTrue(allCancellableTasks.isEmpty()); } - public void testGetAllCancellableTasks_ReturnsTasksFromWhenBreachingThresholds() { + public void testGetAllCancellableTasks_ReturnsTasksWhenBreachingThresholds() { ResourceType resourceType = ResourceType.CPU; - long usage = 150_000_000_000L; + QueryGroupCpuUsage queryGroupCpuUsage = mock(QueryGroupCpuUsage.class); Double threshold = 0.01; QueryGroup queryGroup1 = new QueryGroup( @@ -440,8 +465,8 @@ public void testGetAllCancellableTasks_ReturnsTasksFromWhenBreachingThresholds() Map.of(resourceType, threshold), 1L ); - - 
QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage); + when(queryGroupCpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, queryGroupCpuUsage, 0.001); queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); @@ -453,26 +478,25 @@ public void testGetAllCancellableTasks_ReturnsTasksFromWhenBreachingThresholds() public void testGetCancellableTasksFrom_doesNotReturnTasksWhenQueryGroupIdNotFound() { ResourceType resourceType = ResourceType.CPU; - long usage = 150_000_000_000L; + QueryGroupCpuUsage usage = mock(QueryGroupCpuUsage.class); Double threshold = 0.01; QueryGroup queryGroup1 = new QueryGroup( - "testQueryGroup", + "testQueryGroup1", queryGroupId1, QueryGroup.ResiliencyMode.ENFORCED, Map.of(resourceType, threshold), 1L ); QueryGroup queryGroup2 = new QueryGroup( - "testQueryGroup", + "testQueryGroup2", queryGroupId2, QueryGroup.ResiliencyMode.ENFORCED, Map.of(resourceType, threshold), 1L ); - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage); - + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage, 0.001); queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); activeQueryGroups.add(queryGroup2); @@ -481,21 +505,22 @@ public void testGetCancellableTasksFrom_doesNotReturnTasksWhenQueryGroupIdNotFou assertEquals(0, cancellableTasksFrom.size()); } - private QueryGroupLevelResourceUsageView createResourceUsageViewMock(ResourceType resourceType, Long usage) { + private QueryGroupLevelResourceUsageView createResourceUsageViewMock(ResourceType resourceType, QueryGroupResourceUsage mockUsage, double usageVal) { QueryGroupLevelResourceUsageView mockView = mock(QueryGroupLevelResourceUsageView.class); - when(mockView.getResourceUsageData()).thenReturn(Collections.singletonMap(resourceType, 
usage)); when(mockView.getActiveTasks()).thenReturn(List.of(getRandomSearchTask(1234), getRandomSearchTask(4321))); + when(mockUsage.getReduceByFor(any(), any())).thenReturn(usageVal); + when(mockView.getResourceUsageData()).thenReturn(Collections.singletonMap(resourceType, mockUsage)); return mockView; } - private QueryGroupLevelResourceUsageView createResourceUsageViewMock(ResourceType resourceType, Long usage, Collection ids) { + private QueryGroupLevelResourceUsageView createResourceUsageViewMock(ResourceType resourceType, QueryGroupResourceUsage usage, Collection ids) { QueryGroupLevelResourceUsageView mockView = mock(QueryGroupLevelResourceUsageView.class); when(mockView.getResourceUsageData()).thenReturn(Collections.singletonMap(resourceType, usage)); when(mockView.getActiveTasks()).thenReturn(ids.stream().map(this::getRandomSearchTask).collect(Collectors.toList())); return mockView; } - private Task getRandomSearchTask(long id) { + private QueryGroupTask getRandomSearchTask(long id) { return new SearchTask( id, "transport", diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java index 7a59cb43ded5b..c7e9f2f386ee4 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java @@ -18,27 +18,23 @@ import org.opensearch.wlm.ResourceType; import org.opensearch.tasks.Task; import org.opensearch.test.OpenSearchTestCase; +import org.opensearch.wlm.tracker.TaskResourceUsageCalculator; +import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerServiceTests.TestClock; import java.util.ArrayList; import java.util.Collections; import java.util.List; -public class DefaultTaskSelectionStrategyTests extends OpenSearchTestCase { - - public void testSelectTasksFromDeletedQueryGroup() { - 
DefaultTaskSelectionStrategy testDefaultTaskSelectionStrategy = new DefaultTaskSelectionStrategy(); - long thresholdInLong = 100L; - long reduceBy = Long.MIN_VALUE; - List tasks = getListOfTasks(thresholdInLong); - List selectedTasks = testDefaultTaskSelectionStrategy.selectTasksFromDeletedQueryGroup(tasks); - assertFalse(selectedTasks.isEmpty()); - assertTrue(tasksUsageMeetsThreshold(selectedTasks, reduceBy)); - } +import static org.opensearch.wlm.cancellation.DefaultTaskCancellation.MIN_VALUE; +import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.HEAP_SIZE_BYTES; +public class DefaultTaskSelectionStrategyTests extends OpenSearchTestCase { + private TestClock clock; public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsGreaterThanZero() { - DefaultTaskSelectionStrategy testDefaultTaskSelectionStrategy = new DefaultTaskSelectionStrategy(); + clock = new TestClock(); + DefaultTaskSelectionStrategy testDefaultTaskSelectionStrategy = new DefaultTaskSelectionStrategy(clock::getTime); long thresholdInLong = 100L; - long reduceBy = 50L; + double reduceBy = 50.0 / HEAP_SIZE_BYTES; ResourceType resourceType = ResourceType.MEMORY; List tasks = getListOfTasks(thresholdInLong); List selectedTasks = testDefaultTaskSelectionStrategy.selectTasksForCancellation(tasks, reduceBy, resourceType); @@ -49,7 +45,7 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsGrea public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsLesserThanZero() { DefaultTaskSelectionStrategy testDefaultTaskSelectionStrategy = new DefaultTaskSelectionStrategy(); long thresholdInLong = 100L; - long reduceBy = -50L; + double reduceBy = -50.0 / HEAP_SIZE_BYTES; ResourceType resourceType = ResourceType.MEMORY; List tasks = getListOfTasks(thresholdInLong); try { @@ -63,18 +59,18 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsLess public void 
testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsEqualToZero() { DefaultTaskSelectionStrategy testDefaultTaskSelectionStrategy = new DefaultTaskSelectionStrategy(); long thresholdInLong = 100L; - long reduceBy = 0; + double reduceBy = 0.0; ResourceType resourceType = ResourceType.MEMORY; List tasks = getListOfTasks(thresholdInLong); - List selectedTasks = testDefaultTaskSelectionStrategy.selectTasksForCancellation(tasks, reduceBy, resourceType); + List selectedTasks = testDefaultTaskSelectionStrategy.selectTasksForCancellation(tasks, reduceBy, resourceType); assertTrue(selectedTasks.isEmpty()); } - private boolean tasksUsageMeetsThreshold(List selectedTasks, long threshold) { - long memory = 0; + private boolean tasksUsageMeetsThreshold(List selectedTasks, double threshold) { + double memory = 0; for (Task task : selectedTasks) { - memory += task.getTotalResourceUtilization(ResourceStats.MEMORY); - if (memory > threshold) { + memory += TaskResourceUsageCalculator.from(ResourceType.MEMORY).calculateFor(task, clock::getTime); + if ((memory - threshold) > MIN_VALUE ) { return true; } } diff --git a/server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerServiceTests.java b/server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerServiceTests.java index ca2891cb532f2..e7767f037bb57 100644 --- a/server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerServiceTests.java +++ b/server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerServiceTests.java @@ -35,17 +35,35 @@ import static org.mockito.Mockito.doAnswer; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; +import static org.opensearch.wlm.cancellation.DefaultTaskCancellation.MIN_VALUE; +import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.HEAP_SIZE_BYTES; +import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.PROCESSOR_COUNT; public 
class QueryGroupResourceUsageTrackerServiceTests extends OpenSearchTestCase { TestThreadPool threadPool; TaskResourceTrackingService mockTaskResourceTrackingService; QueryGroupResourceUsageTrackerService queryGroupResourceUsageTrackerService; + public static class TestClock { + long time; + + public void fastForwardBy(long nanos) { + time += nanos; + } + + public long getTime() { + return time; + } + } + + TestClock clock; + @Before public void setup() { + clock = new TestClock(); threadPool = new TestThreadPool(getTestName()); mockTaskResourceTrackingService = mock(TaskResourceTrackingService.class); - queryGroupResourceUsageTrackerService = new QueryGroupResourceUsageTrackerService(mockTaskResourceTrackingService); + queryGroupResourceUsageTrackerService = new QueryGroupResourceUsageTrackerService(mockTaskResourceTrackingService, clock::getTime); } @After @@ -58,13 +76,20 @@ public void testConstructQueryGroupLevelViews_CreatesQueryGroupLevelUsageView_Wh Map activeSearchShardTasks = createActiveSearchShardTasks(queryGroupIds); when(mockTaskResourceTrackingService.getResourceAwareTasks()).thenReturn(activeSearchShardTasks); + clock.fastForwardBy(2000); Map stringQueryGroupLevelResourceUsageViewMap = queryGroupResourceUsageTrackerService .constructQueryGroupLevelUsageViews(); for (String queryGroupId : queryGroupIds) { assertEquals( - 400, - (long) stringQueryGroupLevelResourceUsageViewMap.get(queryGroupId).getResourceUsageData().get(ResourceType.MEMORY) + (400 * 1.0f) / HEAP_SIZE_BYTES, + stringQueryGroupLevelResourceUsageViewMap.get(queryGroupId).getResourceUsageData().get(ResourceType.MEMORY).getCurrentUsage(), + MIN_VALUE + ); + assertEquals( + (200 * 1.0f) / (PROCESSOR_COUNT * 2000), + stringQueryGroupLevelResourceUsageViewMap.get(queryGroupId).getResourceUsageData().get(ResourceType.CPU).getCurrentUsage(), + MIN_VALUE ); assertEquals(2, stringQueryGroupLevelResourceUsageViewMap.get(queryGroupId).getActiveTasks().size()); } @@ -81,11 +106,12 @@ public void 
testConstructQueryGroupLevelUsageViews_WithTasksHavingDifferentResou activeSearchShardTasks.put(1L, createMockTask(SearchShardTask.class, 100, 200, "queryGroup1")); activeSearchShardTasks.put(2L, createMockTask(SearchShardTask.class, 200, 400, "queryGroup1")); when(mockTaskResourceTrackingService.getResourceAwareTasks()).thenReturn(activeSearchShardTasks); - + clock.fastForwardBy(2000); Map queryGroupViews = queryGroupResourceUsageTrackerService .constructQueryGroupLevelUsageViews(); - assertEquals(600, (long) queryGroupViews.get("queryGroup1").getResourceUsageData().get(ResourceType.MEMORY)); + assertEquals((double)600 / HEAP_SIZE_BYTES, queryGroupViews.get("queryGroup1").getResourceUsageData().get(ResourceType.MEMORY).getCurrentUsage(), MIN_VALUE); + assertEquals(((double)300) / (PROCESSOR_COUNT * 2000), queryGroupViews.get("queryGroup1").getResourceUsageData().get(ResourceType.CPU).getCurrentUsage(), MIN_VALUE); assertEquals(2, queryGroupViews.get("queryGroup1").getActiveTasks().size()); } From e26e525c493a3ee6c7fd23f21e698e2c701a7729 Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Wed, 4 Sep 2024 00:11:01 -0700 Subject: [PATCH 15/47] add unit tests and fix existing ones Signed-off-by: Kaushal Kumar --- .../wlm/QueryGroupLevelResourceUsageView.java | 1 - .../java/org/opensearch/wlm/ResourceType.java | 3 - .../cancellation/DefaultTaskCancellation.java | 15 +- .../wlm/tracker/QueryGroupResourceUsage.java | 4 +- ...QueryGroupResourceUsageTrackerService.java | 12 +- .../tracker/TaskResourceUsageCalculator.java | 1 - ...QueryGroupLevelResourceUsageViewTests.java | 48 ++--- .../org/opensearch/wlm/ResourceTypeTests.java | 6 - .../DefaultTaskCancellationTests.java | 202 ++++++++++++++---- .../DefaultTaskSelectionStrategyTests.java | 9 +- .../tracker/QueryGroupResourceUsageTests.java | 98 +++++++++ ...GroupResourceUsageTrackerServiceTests.java | 23 +- .../TaskResourceUsageCalculatorTests.java | 52 +++++ 13 files changed, 367 insertions(+), 107 deletions(-) create mode 
100644 server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTests.java create mode 100644 server/src/test/java/org/opensearch/wlm/tracker/TaskResourceUsageCalculatorTests.java diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupLevelResourceUsageView.java b/server/src/main/java/org/opensearch/wlm/QueryGroupLevelResourceUsageView.java index bb69acc66cb90..dbe942b461b9c 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupLevelResourceUsageView.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupLevelResourceUsageView.java @@ -8,7 +8,6 @@ package org.opensearch.wlm; - import org.opensearch.wlm.tracker.QueryGroupResourceUsage; import java.util.List; diff --git a/server/src/main/java/org/opensearch/wlm/ResourceType.java b/server/src/main/java/org/opensearch/wlm/ResourceType.java index 961e06849e0fc..e810e2aa8c7ce 100644 --- a/server/src/main/java/org/opensearch/wlm/ResourceType.java +++ b/server/src/main/java/org/opensearch/wlm/ResourceType.java @@ -10,11 +10,8 @@ import org.opensearch.common.annotation.PublicApi; import org.opensearch.core.common.io.stream.StreamOutput; -import org.opensearch.core.tasks.resourcetracker.ResourceStats; -import org.opensearch.tasks.Task; import java.io.IOException; -import java.util.function.Function; /** * Enum to hold the resource type diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java index 1995874e0ed71..77eb32d420734 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java @@ -9,11 +9,11 @@ package org.opensearch.wlm.cancellation; import org.opensearch.cluster.metadata.QueryGroup; -import org.opensearch.wlm.QueryGroupTask; -import org.opensearch.wlm.ResourceType; import org.opensearch.tasks.CancellableTask; import 
org.opensearch.tasks.TaskCancellation; import org.opensearch.wlm.QueryGroupLevelResourceUsageView; +import org.opensearch.wlm.QueryGroupTask; +import org.opensearch.wlm.ResourceType; import org.opensearch.wlm.WorkloadManagementSettings; import org.opensearch.wlm.tracker.QueryGroupResourceUsage; @@ -173,7 +173,13 @@ protected List getCancellableTasksFrom(QueryGroup queryGroup) } private boolean shouldCancelTasks(QueryGroup queryGroup, ResourceType resourceType) { - return getReduceBy(queryGroup, resourceType) > MIN_VALUE; + if (queryGroup == null || !queryGroupLevelResourceUsageViews.containsKey(queryGroup.get_id())) { + return false; + } + QueryGroupLevelResourceUsageView queryGroupResourceUsageView = queryGroupLevelResourceUsageViews.get(queryGroup.get_id()); + return queryGroupResourceUsageView.getResourceUsageData() + .get(resourceType) + .isBreachingThresholdFor(queryGroup, workloadManagementSettings); } private List getTaskCancellations(QueryGroup queryGroup, ResourceType resourceType) { @@ -225,7 +231,8 @@ protected List getTaskCancellationsForDeletedQueryGroup(QueryG } private double getReduceBy(QueryGroup queryGroup, ResourceType resourceType) { - if (queryGroup.getResourceLimits().get(resourceType) == null || !queryGroupLevelResourceUsageViews.containsKey(queryGroup.get_id())) { + if (queryGroup.getResourceLimits().get(resourceType) == null + || !queryGroupLevelResourceUsageViews.containsKey(queryGroup.get_id())) { return 0; } final QueryGroupLevelResourceUsageView queryGroupLevelResourceUsage = queryGroupLevelResourceUsageViews.get(queryGroup.get_id()); diff --git a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsage.java b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsage.java index f848f78e39c1c..34b6be638c72c 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsage.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsage.java @@ -24,7 +24,6 @@ public 
abstract class QueryGroupResourceUsage { private double currentUsage; - /** * getter for value field * @return resource usage value @@ -43,10 +42,9 @@ public static QueryGroupResourceUsage from(ResourceType resourceType) { } else if (resourceType == ResourceType.MEMORY) { return new QueryGroupMemoryUsage(); } - throw new IllegalArgumentException("Invalid resource type: " + resourceType.getName() + ". It is currently not supported in wlm"); + throw new IllegalArgumentException("Invalid resource type: " + resourceType + ". It is currently not supported in wlm"); } - /** * Determines whether {@link QueryGroup} is breaching its threshold for the resource * @param queryGroup diff --git a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java index a56faf8c0978d..2f7f6544c5207 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java @@ -8,9 +8,7 @@ package org.opensearch.wlm.tracker; -import org.opensearch.core.tasks.resourcetracker.ResourceStats; import org.opensearch.monitor.jvm.JvmStats; -import org.opensearch.tasks.Task; import org.opensearch.tasks.TaskResourceTrackingService; import org.opensearch.wlm.QueryGroupLevelResourceUsageView; import org.opensearch.wlm.QueryGroupTask; @@ -39,8 +37,7 @@ public class QueryGroupResourceUsageTrackerService { * * @param taskResourceTrackingService Service that helps track resource usage of tasks running on a node. 
*/ - public QueryGroupResourceUsageTrackerService(TaskResourceTrackingService taskResourceTrackingService, - Supplier nanoTimeSupplier) { + public QueryGroupResourceUsageTrackerService(TaskResourceTrackingService taskResourceTrackingService, Supplier nanoTimeSupplier) { this.taskResourceTrackingService = taskResourceTrackingService; this.nanoTimeSupplier = nanoTimeSupplier; } @@ -58,17 +55,14 @@ public Map constructQueryGroupLevelUsa for (Map.Entry> queryGroupEntry : tasksByQueryGroup.entrySet()) { // Compute the QueryGroup resource usage final Map resourceUsage = new HashMap<>(); - for (ResourceType resourceType: TRACKED_RESOURCES) { + for (ResourceType resourceType : TRACKED_RESOURCES) { final QueryGroupResourceUsage queryGroupResourceUsage = QueryGroupResourceUsage.from(resourceType); queryGroupResourceUsage.initialise(queryGroupEntry.getValue(), nanoTimeSupplier); resourceUsage.put(resourceType, queryGroupResourceUsage); } // Add to the QueryGroup View - queryGroupViews.put( - queryGroupEntry.getKey(), - new QueryGroupLevelResourceUsageView(resourceUsage, queryGroupEntry.getValue()) - ); + queryGroupViews.put(queryGroupEntry.getKey(), new QueryGroupLevelResourceUsageView(resourceUsage, queryGroupEntry.getValue())); } return queryGroupViews; } diff --git a/server/src/main/java/org/opensearch/wlm/tracker/TaskResourceUsageCalculator.java b/server/src/main/java/org/opensearch/wlm/tracker/TaskResourceUsageCalculator.java index f2c6f6cd82a37..40bf17e2b9704 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/TaskResourceUsageCalculator.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/TaskResourceUsageCalculator.java @@ -57,4 +57,3 @@ public double calculateFor(Task task, Supplier nanoTimeSupplier) { } } } - diff --git a/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java b/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java index 4dfda5033bdb3..4d962929d4a56 100644 --- 
a/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java +++ b/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java @@ -20,30 +20,30 @@ public class QueryGroupLevelResourceUsageViewTests extends OpenSearchTestCase { Map resourceUsage; List activeTasks; -// public void setUp() throws Exception { -// super.setUp(); -// resourceUsage = Map.of(ResourceType.fromName("memory"), 34L, ResourceType.fromName("cpu"), 12L); -// activeTasks = List.of(getRandomTask(4321)); -// } -// -// public void testGetResourceUsageData() { -// QueryGroupLevelResourceUsageView queryGroupLevelResourceUsageView = new QueryGroupLevelResourceUsageView( -// resourceUsage, -// activeTasks -// ); -// Map resourceUsageData = queryGroupLevelResourceUsageView.getResourceUsageData(); -// assertTrue(assertResourceUsageData(resourceUsageData)); -// } - -// public void testGetActiveTasks() { -// QueryGroupLevelResourceUsageView queryGroupLevelResourceUsageView = new QueryGroupLevelResourceUsageView( -// resourceUsage, -// activeTasks -// ); -// List activeTasks = queryGroupLevelResourceUsageView.getActiveTasks(); -// assertEquals(1, activeTasks.size()); -// assertEquals(4321, activeTasks.get(0).getId()); -// } + // public void setUp() throws Exception { + // super.setUp(); + // resourceUsage = Map.of(ResourceType.fromName("memory"), 34L, ResourceType.fromName("cpu"), 12L); + // activeTasks = List.of(getRandomTask(4321)); + // } + // + // public void testGetResourceUsageData() { + // QueryGroupLevelResourceUsageView queryGroupLevelResourceUsageView = new QueryGroupLevelResourceUsageView( + // resourceUsage, + // activeTasks + // ); + // Map resourceUsageData = queryGroupLevelResourceUsageView.getResourceUsageData(); + // assertTrue(assertResourceUsageData(resourceUsageData)); + // } + + // public void testGetActiveTasks() { + // QueryGroupLevelResourceUsageView queryGroupLevelResourceUsageView = new QueryGroupLevelResourceUsageView( + // 
resourceUsage, + // activeTasks + // ); + // List activeTasks = queryGroupLevelResourceUsageView.getActiveTasks(); + // assertEquals(1, activeTasks.size()); + // assertEquals(4321, activeTasks.get(0).getId()); + // } private boolean assertResourceUsageData(Map resourceUsageData) { return resourceUsageData.get(ResourceType.fromName("memory")) == 34L && resourceUsageData.get(ResourceType.fromName("cpu")) == 12L; diff --git a/server/src/test/java/org/opensearch/wlm/ResourceTypeTests.java b/server/src/test/java/org/opensearch/wlm/ResourceTypeTests.java index 2bbf7b529b14b..16bd8b7e66266 100644 --- a/server/src/test/java/org/opensearch/wlm/ResourceTypeTests.java +++ b/server/src/test/java/org/opensearch/wlm/ResourceTypeTests.java @@ -8,14 +8,8 @@ package org.opensearch.wlm; -import org.opensearch.action.search.SearchShardTask; -import org.opensearch.core.tasks.resourcetracker.ResourceStats; -import org.opensearch.tasks.CancellableTask; import org.opensearch.test.OpenSearchTestCase; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - public class ResourceTypeTests extends OpenSearchTestCase { public void testFromName() { diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java index 9817f69b3be74..a4ac903bdea6a 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java @@ -12,17 +12,17 @@ import org.opensearch.action.search.SearchTask; import org.opensearch.cluster.metadata.QueryGroup; import org.opensearch.core.tasks.TaskId; -import org.opensearch.wlm.QueryGroupTask; -import org.opensearch.wlm.ResourceType; import org.opensearch.tasks.TaskCancellation; import org.opensearch.test.OpenSearchTestCase; import org.opensearch.wlm.QueryGroupLevelResourceUsageView; +import 
org.opensearch.wlm.QueryGroupTask; +import org.opensearch.wlm.ResourceType; import org.opensearch.wlm.WorkloadManagementSettings; -import org.junit.Before; -import org.opensearch.wlm.tracker.QueryGroupResourceUsage.QueryGroupMemoryUsage; import org.opensearch.wlm.tracker.QueryGroupResourceUsage; import org.opensearch.wlm.tracker.QueryGroupResourceUsage.QueryGroupCpuUsage; +import org.opensearch.wlm.tracker.QueryGroupResourceUsage.QueryGroupMemoryUsage; import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerServiceTests.TestClock; +import org.junit.Before; import java.util.Collection; import java.util.Collections; @@ -90,7 +90,8 @@ public void setup() { public void testGetCancellableTasksFrom_setupAppropriateCancellationReasonAndScore() { ResourceType resourceType = ResourceType.CPU; - QueryGroupCpuUsage usage = mock(QueryGroupCpuUsage.class); + QueryGroupCpuUsage cpuUsage = mock(QueryGroupCpuUsage.class); + QueryGroupMemoryUsage memoryUsage = mock(QueryGroupMemoryUsage.class); Double threshold = 0.1; QueryGroup queryGroup1 = new QueryGroup( @@ -101,7 +102,13 @@ public void testGetCancellableTasksFrom_setupAppropriateCancellationReasonAndSco 1L ); clock.fastForwardBy(1000); - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage, 0.3); + when(memoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); + when(cpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); + when(cpuUsage.getReduceByFor(any(), any())).thenReturn(0.001); + when(memoryUsage.getReduceByFor(any(), any())).thenReturn(0.0); + + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); + when(mockView.getResourceUsageData()).thenReturn(Map.of(resourceType, cpuUsage, ResourceType.MEMORY, memoryUsage)); queryGroupLevelViews.put(queryGroupId1, mockView); List cancellableTasksFrom = taskCancellation.getCancellableTasksFrom(queryGroup1); @@ -120,7 +127,8 @@ public void 
testGetCancellableTasksFrom_setupAppropriateCancellationReasonAndSco public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThreshold() { ResourceType resourceType = ResourceType.CPU; - QueryGroupCpuUsage usage = mock(QueryGroupCpuUsage.class); + QueryGroupCpuUsage cpuUsage = mock(QueryGroupCpuUsage.class); + QueryGroupMemoryUsage memoryUsage = mock(QueryGroupMemoryUsage.class); Double threshold = 0.1; QueryGroup queryGroup1 = new QueryGroup( @@ -130,7 +138,13 @@ public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThreshold() { Map.of(resourceType, threshold), 1L ); - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage, 0.15); + when(memoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); + + when(cpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); + when(cpuUsage.getReduceByFor(any(), any())).thenReturn(0.15); + + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); + when(mockView.getResourceUsageData()).thenReturn(Map.of(resourceType, cpuUsage, ResourceType.MEMORY, memoryUsage)); queryGroupLevelViews.put(queryGroupId1, mockView); List cancellableTasksFrom = taskCancellation.getCancellableTasksFrom(queryGroup1); @@ -141,7 +155,8 @@ public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThreshold() { public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThresholdForMemory() { ResourceType resourceType = ResourceType.MEMORY; - QueryGroupMemoryUsage usage = mock(QueryGroupMemoryUsage.class); + QueryGroupCpuUsage cpuUsage = mock(QueryGroupCpuUsage.class); + QueryGroupMemoryUsage memoryUsage = mock(QueryGroupMemoryUsage.class); Double threshold = 0.1; QueryGroup queryGroup1 = new QueryGroup( @@ -151,10 +166,14 @@ public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThresholdForMem Map.of(resourceType, threshold), 1L ); - when(usage.getCurrentUsage()).thenReturn(0.15); - when(usage.isBreachingThresholdFor(any(), 
any())).thenReturn(true); + when(memoryUsage.getCurrentUsage()).thenReturn(0.15); + when(memoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); + when(memoryUsage.getReduceByFor(any(), any())).thenReturn(0.005); + when(cpuUsage.getCurrentUsage()).thenReturn(0.0); + when(cpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage, 0.05); + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); + when(mockView.getResourceUsageData()).thenReturn(Map.of(ResourceType.CPU, cpuUsage, resourceType, memoryUsage)); queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); @@ -167,7 +186,8 @@ public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThresholdForMem public void testGetCancellableTasksFrom_returnsNoTasksWhenNotBreachingThreshold() { ResourceType resourceType = ResourceType.CPU; - QueryGroupMemoryUsage usage = mock(QueryGroupMemoryUsage.class); + QueryGroupCpuUsage cpuUsage = mock(QueryGroupCpuUsage.class); + QueryGroupMemoryUsage memoryUsage = mock(QueryGroupMemoryUsage.class); Double threshold = 0.9; QueryGroup queryGroup1 = new QueryGroup( "testQueryGroup", @@ -176,8 +196,14 @@ public void testGetCancellableTasksFrom_returnsNoTasksWhenNotBreachingThreshold( Map.of(resourceType, threshold), 1L ); - when(usage.isBreachingThresholdFor(any(), any())).thenReturn(true); - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage, 0.0); + when(memoryUsage.getCurrentUsage()).thenReturn(0.0); + when(memoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); + when(memoryUsage.getReduceByFor(any(), any())).thenReturn(0.005); + when(cpuUsage.getCurrentUsage()).thenReturn(0.0); + when(cpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); + + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); + 
when(mockView.getResourceUsageData()).thenReturn(Map.of(ResourceType.CPU, cpuUsage, ResourceType.MEMORY, memoryUsage)); queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); when(workloadManagementSettings.getNodeLevelCpuCancellationThreshold()).thenReturn(0.90); @@ -199,7 +225,7 @@ public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { 1L ); - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage, 0.1); + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); @@ -218,7 +244,9 @@ public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { public void testCancelTasks_cancelsGivenTasks() { ResourceType resourceType = ResourceType.CPU; - QueryGroupCpuUsage usage = mock(QueryGroupCpuUsage.class); + QueryGroupCpuUsage cpuUsage = mock(QueryGroupCpuUsage.class); + QueryGroupMemoryUsage memoryUsage = mock(QueryGroupMemoryUsage.class); + Double threshold = 0.01; QueryGroup queryGroup1 = new QueryGroup( @@ -229,8 +257,16 @@ public void testCancelTasks_cancelsGivenTasks() { 1L ); - when(usage.isBreachingThresholdFor(any(), any())).thenReturn(true); - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage, 0.005); + when(memoryUsage.getCurrentUsage()).thenReturn(0.15); + when(memoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); + + when(cpuUsage.getReduceByFor(any(), any())).thenReturn(0.005); + when(cpuUsage.getCurrentUsage()).thenReturn(0.16); + when(cpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); + + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); + when(mockView.getResourceUsageData()).thenReturn(Map.of(ResourceType.CPU, cpuUsage, ResourceType.MEMORY, memoryUsage)); + queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); @@ -255,8 +291,10 @@ 
public void testCancelTasks_cancelsGivenTasks() { public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { ResourceType resourceType = ResourceType.CPU; - QueryGroupCpuUsage activeQueryGroupUsage = mock(QueryGroupCpuUsage.class); - QueryGroupCpuUsage deletedQueryGroupUsage = mock(QueryGroupCpuUsage.class); + QueryGroupCpuUsage activeQueryGroupCpuUsage = mock(QueryGroupCpuUsage.class); + QueryGroupMemoryUsage activeQueryGroupMemoryUsage = mock(QueryGroupMemoryUsage.class); + QueryGroupCpuUsage deletedQueryGroupCpuUsage = mock(QueryGroupCpuUsage.class); + QueryGroupMemoryUsage deletedQueryGroupMemoryUsage = mock(QueryGroupMemoryUsage.class); Double threshold = 0.01; QueryGroup activeQueryGroup = new QueryGroup( @@ -275,13 +313,34 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { 1L ); - when(activeQueryGroupUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); - when(deletedQueryGroupUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); - when(deletedQueryGroupUsage.getReduceByFor(any(), any())).thenReturn(0.001); - QueryGroupLevelResourceUsageView mockView1 = createResourceUsageViewMock(resourceType, activeQueryGroupUsage, 0.005); - QueryGroupLevelResourceUsageView mockView2 = createResourceUsageViewMock(resourceType, deletedQueryGroupUsage, List.of(1000, 1001)); + when(activeQueryGroupCpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); + when(deletedQueryGroupCpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); + + when(deletedQueryGroupMemoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); + when(activeQueryGroupMemoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); + + when(deletedQueryGroupMemoryUsage.getReduceByFor(any(), any())).thenReturn(0.0); + when(activeQueryGroupMemoryUsage.getReduceByFor(any(), any())).thenReturn(0.0); + + when(deletedQueryGroupCpuUsage.getReduceByFor(any(), any())).thenReturn(0.001); + 
when(activeQueryGroupCpuUsage.getReduceByFor(any(), any())).thenReturn(0.001); + + QueryGroupLevelResourceUsageView mockView1 = createResourceUsageViewMock(); + QueryGroupLevelResourceUsageView mockView2 = createResourceUsageViewMock( + resourceType, + deletedQueryGroupCpuUsage, + List.of(1000, 1001) + ); + + when(mockView1.getResourceUsageData()).thenReturn( + Map.of(ResourceType.CPU, activeQueryGroupCpuUsage, ResourceType.MEMORY, activeQueryGroupMemoryUsage) + ); + when(mockView2.getResourceUsageData()).thenReturn( + Map.of(ResourceType.CPU, deletedQueryGroupCpuUsage, ResourceType.MEMORY, deletedQueryGroupMemoryUsage) + ); queryGroupLevelViews.put(queryGroupId1, mockView1); queryGroupLevelViews.put(queryGroupId2, mockView2); + activeQueryGroups.add(activeQueryGroup); deletedQueryGroups.add(deletedQueryGroup); @@ -316,8 +375,11 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeNotInDuress() { ResourceType resourceType = ResourceType.CPU; - QueryGroupCpuUsage activeQueryGroupUsage = mock(QueryGroupCpuUsage.class); - QueryGroupCpuUsage deletedQueryGroupUsage = mock(QueryGroupCpuUsage.class); + QueryGroupCpuUsage activeQueryGroupCpuUsage = mock(QueryGroupCpuUsage.class); + QueryGroupMemoryUsage activeQueryGroupMemoryUsage = mock(QueryGroupMemoryUsage.class); + QueryGroupCpuUsage deletedQueryGroupCpuUsage = mock(QueryGroupCpuUsage.class); + QueryGroupMemoryUsage deletedQueryGroupMemoryUsage = mock(QueryGroupMemoryUsage.class); + Double threshold = 0.01; QueryGroup activeQueryGroup = new QueryGroup( @@ -336,12 +398,30 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN 1L ); - when(activeQueryGroupUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); - when(deletedQueryGroupUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); - when(deletedQueryGroupUsage.getReduceByFor(any(), any())).thenReturn(0.001); + 
when(activeQueryGroupCpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); + when(deletedQueryGroupCpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); + when(activeQueryGroupMemoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); + when(deletedQueryGroupMemoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); + + when(deletedQueryGroupCpuUsage.getReduceByFor(any(), any())).thenReturn(0.001); + when(activeQueryGroupCpuUsage.getReduceByFor(any(), any())).thenReturn(0.001); + when(deletedQueryGroupMemoryUsage.getReduceByFor(any(), any())).thenReturn(0.0); + when(activeQueryGroupMemoryUsage.getReduceByFor(any(), any())).thenReturn(0.0); + + QueryGroupLevelResourceUsageView mockView1 = createResourceUsageViewMock(); + QueryGroupLevelResourceUsageView mockView2 = createResourceUsageViewMock( + resourceType, + deletedQueryGroupCpuUsage, + List.of(1000, 1001) + ); + + when(mockView1.getResourceUsageData()).thenReturn( + Map.of(ResourceType.CPU, activeQueryGroupCpuUsage, ResourceType.MEMORY, activeQueryGroupMemoryUsage) + ); + when(mockView2.getResourceUsageData()).thenReturn( + Map.of(ResourceType.CPU, deletedQueryGroupCpuUsage, ResourceType.MEMORY, deletedQueryGroupMemoryUsage) + ); - QueryGroupLevelResourceUsageView mockView1 = createResourceUsageViewMock(resourceType, activeQueryGroupUsage, 0.001); - QueryGroupLevelResourceUsageView mockView2 = createResourceUsageViewMock(resourceType, deletedQueryGroupUsage, List.of(1000, 1001)); queryGroupLevelViews.put(queryGroupId1, mockView1); queryGroupLevelViews.put(queryGroupId2, mockView2); activeQueryGroups.add(activeQueryGroup); @@ -378,8 +458,10 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { ResourceType resourceType = ResourceType.CPU; - QueryGroupCpuUsage usage1 = mock(QueryGroupCpuUsage.class); - QueryGroupCpuUsage usage2 = mock(QueryGroupCpuUsage.class); + 
QueryGroupCpuUsage cpuUsage1 = mock(QueryGroupCpuUsage.class); + QueryGroupMemoryUsage memoryUsage1 = mock(QueryGroupMemoryUsage.class); + QueryGroupCpuUsage cpuUsage2 = mock(QueryGroupCpuUsage.class); + QueryGroupMemoryUsage memoryUsage2 = mock(QueryGroupMemoryUsage.class); Double threshold = 0.01; QueryGroup queryGroup1 = new QueryGroup( @@ -398,11 +480,22 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { 1L ); - when(usage1.isBreachingThresholdFor(any(), any())).thenReturn(true); - when(usage2.isBreachingThresholdFor(any(), any())).thenReturn(true); - queryGroupLevelViews.put(queryGroupId1, createResourceUsageViewMock(resourceType, usage1, 0.0001)); - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage2, 0.0001); + when(cpuUsage1.isBreachingThresholdFor(any(), any())).thenReturn(true); + when(cpuUsage2.isBreachingThresholdFor(any(), any())).thenReturn(true); + when(memoryUsage2.isBreachingThresholdFor(any(), any())).thenReturn(false); + when(memoryUsage1.isBreachingThresholdFor(any(), any())).thenReturn(false); + + when(cpuUsage1.getReduceByFor(any(), any())).thenReturn(0.001); + when(cpuUsage2.getReduceByFor(any(), any())).thenReturn(0.001); + when(memoryUsage2.getReduceByFor(any(), any())).thenReturn(0.0); + when(memoryUsage1.getReduceByFor(any(), any())).thenReturn(0.0); + + QueryGroupLevelResourceUsageView mockView1 = createResourceUsageViewMock(); + when(mockView1.getResourceUsageData()).thenReturn(Map.of(ResourceType.CPU, cpuUsage1, ResourceType.MEMORY, memoryUsage1)); + queryGroupLevelViews.put(queryGroupId1, mockView1); + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); when(mockView.getActiveTasks()).thenReturn(List.of(getRandomSearchTask(5678), getRandomSearchTask(8765))); + when(mockView.getResourceUsageData()).thenReturn(Map.of(ResourceType.CPU, cpuUsage2, ResourceType.MEMORY, memoryUsage2)); queryGroupLevelViews.put(queryGroupId2, mockView); 
Collections.addAll(activeQueryGroups, queryGroup1, queryGroup2); @@ -435,6 +528,7 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { public void testGetAllCancellableTasks_ReturnsNoTasksWhenNotBreachingThresholds() { ResourceType resourceType = ResourceType.CPU; QueryGroupCpuUsage queryGroupCpuUsage = mock(QueryGroupCpuUsage.class); + QueryGroupMemoryUsage queryGroupMemoryUsage = mock(QueryGroupMemoryUsage.class); Double threshold = 0.1; QueryGroup queryGroup1 = new QueryGroup( @@ -445,7 +539,13 @@ public void testGetAllCancellableTasks_ReturnsNoTasksWhenNotBreachingThresholds( 1L ); when(queryGroupCpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, queryGroupCpuUsage, 0.001); + when(queryGroupMemoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); + when(queryGroupCpuUsage.getReduceByFor(any(), any())).thenReturn(0.0); + when(queryGroupCpuUsage.getReduceByFor(any(), any())).thenReturn(0.0); + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); + when(mockView.getResourceUsageData()).thenReturn( + Map.of(ResourceType.CPU, queryGroupCpuUsage, ResourceType.MEMORY, queryGroupMemoryUsage) + ); queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); @@ -456,6 +556,7 @@ public void testGetAllCancellableTasks_ReturnsNoTasksWhenNotBreachingThresholds( public void testGetAllCancellableTasks_ReturnsTasksWhenBreachingThresholds() { ResourceType resourceType = ResourceType.CPU; QueryGroupCpuUsage queryGroupCpuUsage = mock(QueryGroupCpuUsage.class); + QueryGroupMemoryUsage queryGroupMemoryUsage = mock(QueryGroupMemoryUsage.class); Double threshold = 0.01; QueryGroup queryGroup1 = new QueryGroup( @@ -466,7 +567,14 @@ public void testGetAllCancellableTasks_ReturnsTasksWhenBreachingThresholds() { 1L ); when(queryGroupCpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); - 
QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, queryGroupCpuUsage, 0.001); + when(queryGroupCpuUsage.getReduceByFor(any(), any())).thenReturn(0.005); + when(queryGroupMemoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); + when(queryGroupMemoryUsage.getReduceByFor(any(), any())).thenReturn(0.0); + + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); + when(mockView.getResourceUsageData()).thenReturn( + Map.of(ResourceType.CPU, queryGroupCpuUsage, ResourceType.MEMORY, queryGroupMemoryUsage) + ); queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); @@ -496,7 +604,7 @@ public void testGetCancellableTasksFrom_doesNotReturnTasksWhenQueryGroupIdNotFou 1L ); - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(resourceType, usage, 0.001); + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); activeQueryGroups.add(queryGroup2); @@ -505,15 +613,17 @@ public void testGetCancellableTasksFrom_doesNotReturnTasksWhenQueryGroupIdNotFou assertEquals(0, cancellableTasksFrom.size()); } - private QueryGroupLevelResourceUsageView createResourceUsageViewMock(ResourceType resourceType, QueryGroupResourceUsage mockUsage, double usageVal) { + private QueryGroupLevelResourceUsageView createResourceUsageViewMock() { QueryGroupLevelResourceUsageView mockView = mock(QueryGroupLevelResourceUsageView.class); when(mockView.getActiveTasks()).thenReturn(List.of(getRandomSearchTask(1234), getRandomSearchTask(4321))); - when(mockUsage.getReduceByFor(any(), any())).thenReturn(usageVal); - when(mockView.getResourceUsageData()).thenReturn(Collections.singletonMap(resourceType, mockUsage)); return mockView; } - private QueryGroupLevelResourceUsageView createResourceUsageViewMock(ResourceType resourceType, QueryGroupResourceUsage usage, Collection ids) { + private 
QueryGroupLevelResourceUsageView createResourceUsageViewMock( + ResourceType resourceType, + QueryGroupResourceUsage usage, + Collection ids + ) { QueryGroupLevelResourceUsageView mockView = mock(QueryGroupLevelResourceUsageView.class); when(mockView.getResourceUsageData()).thenReturn(Collections.singletonMap(resourceType, usage)); when(mockView.getActiveTasks()).thenReturn(ids.stream().map(this::getRandomSearchTask).collect(Collectors.toList())); diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java index c7e9f2f386ee4..f5f0e79903e30 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java @@ -14,12 +14,12 @@ import org.opensearch.core.tasks.resourcetracker.ResourceStats; import org.opensearch.core.tasks.resourcetracker.ResourceStatsType; import org.opensearch.core.tasks.resourcetracker.ResourceUsageMetric; -import org.opensearch.wlm.QueryGroupTask; -import org.opensearch.wlm.ResourceType; import org.opensearch.tasks.Task; import org.opensearch.test.OpenSearchTestCase; -import org.opensearch.wlm.tracker.TaskResourceUsageCalculator; +import org.opensearch.wlm.QueryGroupTask; +import org.opensearch.wlm.ResourceType; import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerServiceTests.TestClock; +import org.opensearch.wlm.tracker.TaskResourceUsageCalculator; import java.util.ArrayList; import java.util.Collections; @@ -30,6 +30,7 @@ public class DefaultTaskSelectionStrategyTests extends OpenSearchTestCase { private TestClock clock; + public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsGreaterThanZero() { clock = new TestClock(); DefaultTaskSelectionStrategy testDefaultTaskSelectionStrategy = new DefaultTaskSelectionStrategy(clock::getTime); @@ -70,7 
+71,7 @@ private boolean tasksUsageMeetsThreshold(List selectedTasks, dou double memory = 0; for (Task task : selectedTasks) { memory += TaskResourceUsageCalculator.from(ResourceType.MEMORY).calculateFor(task, clock::getTime); - if ((memory - threshold) > MIN_VALUE ) { + if ((memory - threshold) > MIN_VALUE) { return true; } } diff --git a/server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTests.java b/server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTests.java new file mode 100644 index 0000000000000..a755287c79336 --- /dev/null +++ b/server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTests.java @@ -0,0 +1,98 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.wlm.tracker; + +import org.opensearch.cluster.metadata.QueryGroup; +import org.opensearch.core.tasks.resourcetracker.ResourceStats; +import org.opensearch.test.OpenSearchTestCase; +import org.opensearch.wlm.QueryGroupTask; +import org.opensearch.wlm.ResourceType; +import org.opensearch.wlm.WorkloadManagementSettings; +import org.opensearch.wlm.tracker.QueryGroupResourceUsage.QueryGroupCpuUsage; +import org.opensearch.wlm.tracker.QueryGroupResourceUsage.QueryGroupMemoryUsage; +import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerServiceTests.TestClock; + +import java.util.List; +import java.util.Map; + +import static org.opensearch.wlm.cancellation.DefaultTaskCancellation.MIN_VALUE; +import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.HEAP_SIZE_BYTES; +import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.PROCESSOR_COUNT; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class QueryGroupResourceUsageTests extends OpenSearchTestCase { + QueryGroupResourceUsage 
sut; + WorkloadManagementSettings settings; + + public void testFactoryMethods() { + assertTrue(QueryGroupResourceUsage.from(ResourceType.CPU) instanceof QueryGroupCpuUsage); + assertTrue(QueryGroupResourceUsage.from(ResourceType.MEMORY) instanceof QueryGroupMemoryUsage); + assertThrows(IllegalArgumentException.class, () -> QueryGroupResourceUsage.from(null)); + } + + public void testQueryGroupCpuUsage() { + sut = new QueryGroupCpuUsage(); + TestClock clock = new TestClock(); + long fastForwardTime = PROCESSOR_COUNT * 200L; + clock.fastForwardBy(fastForwardTime); + QueryGroup queryGroup = new QueryGroup( + "testQG", + QueryGroup.ResiliencyMode.ENFORCED, + Map.of(ResourceType.CPU, 0.5 / PROCESSOR_COUNT) + ); + + sut.initialise(List.of(createMockTaskWithResourceStats(QueryGroupTask.class, fastForwardTime, 200, 0, 123)), clock::getTime); + settings = mock(WorkloadManagementSettings.class); + when(settings.getNodeLevelCpuCancellationThreshold()).thenReturn(0.90); + + double expectedNormalisedThreshold = 0.5 / PROCESSOR_COUNT * 0.9; + double expectedQueryGroupCpuUsage = 1.0 / PROCESSOR_COUNT; + double expectedReduceBy = expectedQueryGroupCpuUsage - expectedNormalisedThreshold; + assertEquals(expectedNormalisedThreshold, sut.getNormalisedThresholdFor(queryGroup, settings), MIN_VALUE); + assertEquals(expectedQueryGroupCpuUsage, sut.getCurrentUsage(), MIN_VALUE); + assertTrue(sut.isBreachingThresholdFor(queryGroup, settings)); + assertEquals(expectedReduceBy, sut.getReduceByFor(queryGroup, settings), MIN_VALUE); + } + + public void testQueryGroupMemoryUsage() { + sut = new QueryGroupMemoryUsage(); + TestClock clock = new TestClock(); + QueryGroup queryGroup = new QueryGroup( + "testQG", + QueryGroup.ResiliencyMode.ENFORCED, + Map.of(ResourceType.MEMORY, 500.0 / HEAP_SIZE_BYTES) + ); + + sut.initialise(List.of(createMockTaskWithResourceStats(QueryGroupTask.class, 100, 200, 0, 123)), clock::getTime); + settings = mock(WorkloadManagementSettings.class); + 
when(settings.getNodeLevelMemoryCancellationThreshold()).thenReturn(0.90); + + double expectedNormalisedThreshold = 500.0 / HEAP_SIZE_BYTES * 0.9; + double expectedCurrentUsage = 200.0 / HEAP_SIZE_BYTES; + assertEquals(expectedNormalisedThreshold, sut.getNormalisedThresholdFor(queryGroup, settings), MIN_VALUE); + assertEquals(expectedCurrentUsage, sut.getCurrentUsage(), MIN_VALUE); + assertFalse(sut.isBreachingThresholdFor(queryGroup, settings)); + } + + public static T createMockTaskWithResourceStats( + Class type, + long cpuUsage, + long heapUsage, + long startTimeNanos, + long taskId + ) { + T task = mock(type); + when(task.getTotalResourceUtilization(ResourceStats.CPU)).thenReturn(cpuUsage); + when(task.getTotalResourceUtilization(ResourceStats.MEMORY)).thenReturn(heapUsage); + when(task.getStartTimeNanos()).thenReturn(startTimeNanos); + when(task.getId()).thenReturn(taskId); + return task; + } +} diff --git a/server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerServiceTests.java b/server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerServiceTests.java index e7767f037bb57..4e59e88de9baa 100644 --- a/server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerServiceTests.java +++ b/server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerServiceTests.java @@ -31,13 +31,13 @@ import java.util.concurrent.atomic.AtomicBoolean; import static org.opensearch.wlm.QueryGroupTask.QUERY_GROUP_ID_HEADER; +import static org.opensearch.wlm.cancellation.DefaultTaskCancellation.MIN_VALUE; +import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.HEAP_SIZE_BYTES; +import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.PROCESSOR_COUNT; import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.Mockito.doAnswer; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; -import static 
org.opensearch.wlm.cancellation.DefaultTaskCancellation.MIN_VALUE; -import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.HEAP_SIZE_BYTES; -import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.PROCESSOR_COUNT; public class QueryGroupResourceUsageTrackerServiceTests extends OpenSearchTestCase { TestThreadPool threadPool; @@ -83,7 +83,10 @@ public void testConstructQueryGroupLevelViews_CreatesQueryGroupLevelUsageView_Wh for (String queryGroupId : queryGroupIds) { assertEquals( (400 * 1.0f) / HEAP_SIZE_BYTES, - stringQueryGroupLevelResourceUsageViewMap.get(queryGroupId).getResourceUsageData().get(ResourceType.MEMORY).getCurrentUsage(), + stringQueryGroupLevelResourceUsageViewMap.get(queryGroupId) + .getResourceUsageData() + .get(ResourceType.MEMORY) + .getCurrentUsage(), MIN_VALUE ); assertEquals( @@ -110,8 +113,16 @@ public void testConstructQueryGroupLevelUsageViews_WithTasksHavingDifferentResou Map queryGroupViews = queryGroupResourceUsageTrackerService .constructQueryGroupLevelUsageViews(); - assertEquals((double)600 / HEAP_SIZE_BYTES, queryGroupViews.get("queryGroup1").getResourceUsageData().get(ResourceType.MEMORY).getCurrentUsage(), MIN_VALUE); - assertEquals(((double)300) / (PROCESSOR_COUNT * 2000), queryGroupViews.get("queryGroup1").getResourceUsageData().get(ResourceType.CPU).getCurrentUsage(), MIN_VALUE); + assertEquals( + (double) 600 / HEAP_SIZE_BYTES, + queryGroupViews.get("queryGroup1").getResourceUsageData().get(ResourceType.MEMORY).getCurrentUsage(), + MIN_VALUE + ); + assertEquals( + ((double) 300) / (PROCESSOR_COUNT * 2000), + queryGroupViews.get("queryGroup1").getResourceUsageData().get(ResourceType.CPU).getCurrentUsage(), + MIN_VALUE + ); assertEquals(2, queryGroupViews.get("queryGroup1").getActiveTasks().size()); } diff --git a/server/src/test/java/org/opensearch/wlm/tracker/TaskResourceUsageCalculatorTests.java 
b/server/src/test/java/org/opensearch/wlm/tracker/TaskResourceUsageCalculatorTests.java new file mode 100644 index 0000000000000..c8e7209dff20d --- /dev/null +++ b/server/src/test/java/org/opensearch/wlm/tracker/TaskResourceUsageCalculatorTests.java @@ -0,0 +1,52 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.wlm.tracker; + +import org.opensearch.test.OpenSearchTestCase; +import org.opensearch.wlm.QueryGroupTask; +import org.opensearch.wlm.ResourceType; +import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerServiceTests.TestClock; +import org.opensearch.wlm.tracker.TaskResourceUsageCalculator.TaskCpuUsageCalculator; +import org.opensearch.wlm.tracker.TaskResourceUsageCalculator.TaskMemoryUsageCalculator; + +import static org.opensearch.wlm.cancellation.DefaultTaskCancellation.MIN_VALUE; +import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTests.createMockTaskWithResourceStats; +import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.HEAP_SIZE_BYTES; + +public class TaskResourceUsageCalculatorTests extends OpenSearchTestCase { + TaskResourceUsageCalculator sut; + + public void testFactoryMethod() { + assertTrue(TaskResourceUsageCalculator.from(ResourceType.CPU) instanceof TaskCpuUsageCalculator); + assertTrue(TaskResourceUsageCalculator.from(ResourceType.MEMORY) instanceof TaskMemoryUsageCalculator); + assertThrows(IllegalArgumentException.class, () -> TaskMemoryUsageCalculator.from(null)); + } + + public void testTaskCpuUsageCalculator() { + sut = new TaskCpuUsageCalculator(); + TestClock clock = new TestClock(); + QueryGroupTask task = createMockTaskWithResourceStats(QueryGroupTask.class, 100, 200, 0, 1); + clock.fastForwardBy(200); + + double expectedUsage = 0.5; + double actualUsage = sut.calculateFor(task, clock::getTime); + 
assertEquals(expectedUsage, actualUsage, MIN_VALUE); + } + + public void testTaskMemoryUsageCalculator() { + sut = new TaskMemoryUsageCalculator(); + TestClock clock = new TestClock(); + QueryGroupTask task = createMockTaskWithResourceStats(QueryGroupTask.class, 100, 200, 0, 1); + clock.fastForwardBy(200); + + double expectedUsage = 200.0 / HEAP_SIZE_BYTES; + double actualUsage = sut.calculateFor(task, clock::getTime); + assertEquals(expectedUsage, actualUsage, MIN_VALUE); + } +} From 0ff2b097ba3be60055915303c9513d298743ac0e Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Wed, 4 Sep 2024 00:36:08 -0700 Subject: [PATCH 16/47] uncomment the test case Signed-off-by: Kaushal Kumar --- .../wlm/tracker/QueryGroupResourceUsage.java | 7 +- ...QueryGroupLevelResourceUsageViewTests.java | 80 ++++++++++--------- 2 files changed, 44 insertions(+), 43 deletions(-) diff --git a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsage.java b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsage.java index 34b6be638c72c..8ac08792cba62 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsage.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsage.java @@ -86,7 +86,7 @@ public void initialise(List tasks, Supplier timeSupplier) * @param tasks list of tasks in the query group * @param timeSupplier nano time supplier */ - public abstract double calculateResourceUsage(List tasks, Supplier timeSupplier); + protected abstract double calculateResourceUsage(List tasks, Supplier timeSupplier); /** * class to store cpu usage for the query group @@ -98,7 +98,7 @@ public double getNormalisedThresholdFor(QueryGroup queryGroup, WorkloadManagemen } @Override - public double calculateResourceUsage(List tasks, Supplier timeSupplier) { + protected double calculateResourceUsage(List tasks, Supplier timeSupplier) { double usage = tasks.stream().mapToDouble(task -> { return 
TaskResourceUsageCalculator.from(ResourceType.CPU).calculateFor(task, timeSupplier); }).sum(); @@ -118,12 +118,11 @@ public double getNormalisedThresholdFor(QueryGroup queryGroup, WorkloadManagemen } @Override - public double calculateResourceUsage(List tasks, Supplier timeSupplier) { + protected double calculateResourceUsage(List tasks, Supplier timeSupplier) { double usage = tasks.stream().mapToDouble(task -> { return TaskResourceUsageCalculator.from(ResourceType.MEMORY).calculateFor(task, timeSupplier); }).sum(); return usage; } } - } diff --git a/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java b/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java index 4d962929d4a56..aec9dc829b26c 100644 --- a/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java +++ b/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java @@ -8,55 +8,57 @@ package org.opensearch.wlm; -import org.opensearch.action.search.SearchAction; -import org.opensearch.core.tasks.TaskId; import org.opensearch.test.OpenSearchTestCase; +import org.opensearch.wlm.tracker.QueryGroupResourceUsage; +import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerServiceTests; -import java.util.Collections; import java.util.List; import java.util.Map; +import static org.opensearch.wlm.cancellation.DefaultTaskCancellation.MIN_VALUE; +import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTests.createMockTaskWithResourceStats; +import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.HEAP_SIZE_BYTES; + public class QueryGroupLevelResourceUsageViewTests extends OpenSearchTestCase { - Map resourceUsage; + Map resourceUsage; List activeTasks; + QueryGroupResourceUsageTrackerServiceTests.TestClock clock; + + public void setUp() throws Exception { + super.setUp(); + QueryGroupResourceUsage.QueryGroupCpuUsage cpuUsage = new QueryGroupResourceUsage.QueryGroupCpuUsage(); 
+ QueryGroupResourceUsage.QueryGroupMemoryUsage memoryUsage = new QueryGroupResourceUsage.QueryGroupMemoryUsage(); + clock = new QueryGroupResourceUsageTrackerServiceTests.TestClock(); + activeTasks = List.of(createMockTaskWithResourceStats(QueryGroupTask.class, 100, 200, 0, 1)); + clock.fastForwardBy(300); + + cpuUsage.initialise(activeTasks, clock::getTime); + memoryUsage.initialise(activeTasks, clock::getTime); + + resourceUsage = Map.of(ResourceType.MEMORY, memoryUsage, ResourceType.CPU, memoryUsage); + } - // public void setUp() throws Exception { - // super.setUp(); - // resourceUsage = Map.of(ResourceType.fromName("memory"), 34L, ResourceType.fromName("cpu"), 12L); - // activeTasks = List.of(getRandomTask(4321)); - // } - // - // public void testGetResourceUsageData() { - // QueryGroupLevelResourceUsageView queryGroupLevelResourceUsageView = new QueryGroupLevelResourceUsageView( - // resourceUsage, - // activeTasks - // ); - // Map resourceUsageData = queryGroupLevelResourceUsageView.getResourceUsageData(); - // assertTrue(assertResourceUsageData(resourceUsageData)); - // } - - // public void testGetActiveTasks() { - // QueryGroupLevelResourceUsageView queryGroupLevelResourceUsageView = new QueryGroupLevelResourceUsageView( - // resourceUsage, - // activeTasks - // ); - // List activeTasks = queryGroupLevelResourceUsageView.getActiveTasks(); - // assertEquals(1, activeTasks.size()); - // assertEquals(4321, activeTasks.get(0).getId()); - // } - - private boolean assertResourceUsageData(Map resourceUsageData) { - return resourceUsageData.get(ResourceType.fromName("memory")) == 34L && resourceUsageData.get(ResourceType.fromName("cpu")) == 12L; + public void testGetResourceUsageData() { + QueryGroupLevelResourceUsageView queryGroupLevelResourceUsageView = new QueryGroupLevelResourceUsageView( + resourceUsage, + activeTasks + ); + Map resourceUsageData = queryGroupLevelResourceUsageView.getResourceUsageData(); + 
assertTrue(assertResourceUsageData(resourceUsageData)); } - private QueryGroupTask getRandomTask(long id) { - return new QueryGroupTask( - id, - "transport", - SearchAction.NAME, - "test description", - new TaskId(randomLong() + ":" + randomLong()), - Collections.emptyMap() + public void testGetActiveTasks() { + QueryGroupLevelResourceUsageView queryGroupLevelResourceUsageView = new QueryGroupLevelResourceUsageView( + resourceUsage, + activeTasks ); + List activeTasks = queryGroupLevelResourceUsageView.getActiveTasks(); + assertEquals(1, activeTasks.size()); + assertEquals(1, activeTasks.get(0).getId()); + } + + private boolean assertResourceUsageData(Map resourceUsageData) { + return (resourceUsageData.get(ResourceType.MEMORY).getCurrentUsage() - 200.0 / HEAP_SIZE_BYTES) <= MIN_VALUE + && (resourceUsageData.get(ResourceType.CPU).getCurrentUsage() - 100.0 / (300)) < MIN_VALUE; } } From ddb8dce52667e3be121d7684ab6e46b178d40e74 Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Wed, 4 Sep 2024 00:40:11 -0700 Subject: [PATCH 17/47] update CHANGELOG Signed-off-by: Kaushal Kumar --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f65aba38c2ad5..b3fc588d52b8c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Add took time to request nodes stats ([#15054](https://github.com/opensearch-project/OpenSearch/pull/15054)) - [Workload Management] Add Get QueryGroup API Logic ([14709](https://github.com/opensearch-project/OpenSearch/pull/14709)) - [Workload Management] Add Settings for Workload Management feature ([#15028](https://github.com/opensearch-project/OpenSearch/pull/15028)) -- [Workload Management] QueryGroup resource cancellation framework changes ([#15151](https://github.com/opensearch-project/OpenSearch/pull/15151)) +- [Workload Management] QueryGroup resource cancellation framework changes 
([#15651](https://github.com/opensearch-project/OpenSearch/pull/15651)) - [Workload Management] QueryGroup resource tracking framework changes ([#13897](https://github.com/opensearch-project/OpenSearch/pull/13897)) - Support filtering on a large list encoded by bitmap ([#14774](https://github.com/opensearch-project/OpenSearch/pull/14774)) - Add slice execution listeners to SearchOperationListener interface ([#15153](https://github.com/opensearch-project/OpenSearch/pull/15153)) From e8366a55795b3ebb4a5ec881c4a899c15df63fff Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Wed, 4 Sep 2024 07:46:43 -0700 Subject: [PATCH 18/47] fix imports Signed-off-by: Kaushal Kumar --- .../cancellation/DefaultTaskCancellation.java | 14 ++-- .../DefaultTaskCancellationTests.java | 68 ++++++++----------- .../tracker/QueryGroupResourceUsageTests.java | 8 +-- 3 files changed, 37 insertions(+), 53 deletions(-) diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java index 77eb32d420734..9e45f278ce6e2 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java @@ -11,6 +11,7 @@ import org.opensearch.cluster.metadata.QueryGroup; import org.opensearch.tasks.CancellableTask; import org.opensearch.tasks.TaskCancellation; +import org.opensearch.wlm.MutableQueryGroupFragment.ResiliencyMode; import org.opensearch.wlm.QueryGroupLevelResourceUsageView; import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; @@ -76,7 +77,7 @@ public DefaultTaskCancellation( */ public final void cancelTasks() { // cancel tasks from QueryGroups that are in Enforced mode that are breaching their resource limits - cancelTasks(QueryGroup.ResiliencyMode.ENFORCED); + cancelTasks(ResiliencyMode.ENFORCED); // if the node is in duress, cancel tasks 
accordingly. handleNodeDuress(); } @@ -86,10 +87,7 @@ private void handleNodeDuress() { return; } // List of tasks to be executed in order if the node is in duress - List> duressActions = List.of( - v -> cancelTasksFromDeletedQueryGroups(), - v -> cancelTasks(QueryGroup.ResiliencyMode.SOFT) - ); + List> duressActions = List.of(v -> cancelTasksFromDeletedQueryGroups(), v -> cancelTasks(ResiliencyMode.SOFT)); for (Consumer duressAction : duressActions) { if (!isNodeInDuress.getAsBoolean()) { @@ -108,7 +106,7 @@ private void cancelTasksFromDeletedQueryGroups() { * * @return List of tasks that can be cancelled */ - protected List getAllCancellableTasks(QueryGroup.ResiliencyMode resiliencyMode) { + protected List getAllCancellableTasks(ResiliencyMode resiliencyMode) { return getAllCancellableTasks(getQueryGroupsToCancelFrom(resiliencyMode)); } @@ -126,7 +124,7 @@ protected List getAllCancellableTasks(Collection q * * @return List of QueryGroups */ - private List getQueryGroupsToCancelFrom(QueryGroup.ResiliencyMode resiliencyMode) { + private List getQueryGroupsToCancelFrom(ResiliencyMode resiliencyMode) { final List queryGroupsToCancelFrom = new ArrayList<>(); for (QueryGroup queryGroup : this.activeQueryGroups) { @@ -151,7 +149,7 @@ private List getQueryGroupsToCancelFrom(QueryGroup.ResiliencyMode re return queryGroupsToCancelFrom; } - private void cancelTasks(QueryGroup.ResiliencyMode resiliencyMode) { + private void cancelTasks(ResiliencyMode resiliencyMode) { cancelTasks(getAllCancellableTasks(resiliencyMode)); } diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java index a4ac903bdea6a..fd4875de18bcd 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java @@ -14,6 +14,8 @@ import 
org.opensearch.core.tasks.TaskId; import org.opensearch.tasks.TaskCancellation; import org.opensearch.test.OpenSearchTestCase; +import org.opensearch.wlm.MutableQueryGroupFragment; +import org.opensearch.wlm.MutableQueryGroupFragment.ResiliencyMode; import org.opensearch.wlm.QueryGroupLevelResourceUsageView; import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; @@ -97,8 +99,7 @@ public void testGetCancellableTasksFrom_setupAppropriateCancellationReasonAndSco QueryGroup queryGroup1 = new QueryGroup( "testQueryGroup", queryGroupId1, - QueryGroup.ResiliencyMode.ENFORCED, - Map.of(resourceType, threshold), + new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold)), 1L ); clock.fastForwardBy(1000); @@ -134,8 +135,7 @@ public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThreshold() { QueryGroup queryGroup1 = new QueryGroup( "testQueryGroup", queryGroupId1, - QueryGroup.ResiliencyMode.ENFORCED, - Map.of(resourceType, threshold), + new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold)), 1L ); when(memoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); @@ -162,8 +162,7 @@ public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThresholdForMem QueryGroup queryGroup1 = new QueryGroup( "testQueryGroup", queryGroupId1, - QueryGroup.ResiliencyMode.ENFORCED, - Map.of(resourceType, threshold), + new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold)), 1L ); when(memoryUsage.getCurrentUsage()).thenReturn(0.15); @@ -178,7 +177,7 @@ public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThresholdForMem queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); - List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(QueryGroup.ResiliencyMode.ENFORCED); + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); assertEquals(2, 
cancellableTasksFrom.size()); assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); @@ -192,8 +191,7 @@ public void testGetCancellableTasksFrom_returnsNoTasksWhenNotBreachingThreshold( QueryGroup queryGroup1 = new QueryGroup( "testQueryGroup", queryGroupId1, - QueryGroup.ResiliencyMode.ENFORCED, - Map.of(resourceType, threshold), + new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold)), 1L ); when(memoryUsage.getCurrentUsage()).thenReturn(0.0); @@ -220,8 +218,7 @@ public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { QueryGroup queryGroup1 = new QueryGroup( "testQueryGroup", queryGroupId1, - QueryGroup.ResiliencyMode.ENFORCED, - Map.of(resourceType, threshold), + new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold)), 1L ); @@ -238,7 +235,7 @@ public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { () -> false ); - List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(QueryGroup.ResiliencyMode.SOFT); + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.SOFT); assertEquals(0, cancellableTasksFrom.size()); } @@ -252,8 +249,7 @@ public void testCancelTasks_cancelsGivenTasks() { QueryGroup queryGroup1 = new QueryGroup( "testQueryGroup", queryGroupId1, - QueryGroup.ResiliencyMode.ENFORCED, - Map.of(resourceType, threshold), + new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold)), 1L ); @@ -279,7 +275,7 @@ public void testCancelTasks_cancelsGivenTasks() { () -> false ); - List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(QueryGroup.ResiliencyMode.ENFORCED); + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); assertEquals(2, cancellableTasksFrom.size()); assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); assertEquals(4321, 
cancellableTasksFrom.get(1).getTask().getId()); @@ -300,16 +296,14 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { QueryGroup activeQueryGroup = new QueryGroup( "testQueryGroup", queryGroupId1, - QueryGroup.ResiliencyMode.ENFORCED, - Map.of(resourceType, threshold), + new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold)), 1L ); QueryGroup deletedQueryGroup = new QueryGroup( "testQueryGroup", queryGroupId2, - QueryGroup.ResiliencyMode.ENFORCED, - Map.of(resourceType, threshold), + new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold)), 1L ); @@ -353,7 +347,7 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { () -> true ); - List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(QueryGroup.ResiliencyMode.ENFORCED); + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); assertEquals(2, cancellableTasksFrom.size()); assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); @@ -385,16 +379,14 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN QueryGroup activeQueryGroup = new QueryGroup( "testQueryGroup", queryGroupId1, - QueryGroup.ResiliencyMode.ENFORCED, - Map.of(resourceType, threshold), + new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold)), 1L ); QueryGroup deletedQueryGroup = new QueryGroup( "testQueryGroup", queryGroupId2, - QueryGroup.ResiliencyMode.ENFORCED, - Map.of(resourceType, threshold), + new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold)), 1L ); @@ -436,7 +428,7 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN () -> false ); - List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(QueryGroup.ResiliencyMode.ENFORCED); + List cancellableTasksFrom = 
taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); assertEquals(2, cancellableTasksFrom.size()); assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); @@ -467,16 +459,14 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { QueryGroup queryGroup1 = new QueryGroup( "testQueryGroup", queryGroupId1, - QueryGroup.ResiliencyMode.ENFORCED, - Map.of(resourceType, threshold), + new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold)), 1L ); QueryGroup queryGroup2 = new QueryGroup( "testQueryGroup", queryGroupId2, - QueryGroup.ResiliencyMode.SOFT, - Map.of(resourceType, threshold), + new MutableQueryGroupFragment(ResiliencyMode.SOFT, Map.of(resourceType, threshold)), 1L ); @@ -508,12 +498,12 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { () -> true ); - List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(QueryGroup.ResiliencyMode.ENFORCED); + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); assertEquals(2, cancellableTasksFrom.size()); assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); - List cancellableTasksFrom1 = taskCancellation.getAllCancellableTasks(QueryGroup.ResiliencyMode.SOFT); + List cancellableTasksFrom1 = taskCancellation.getAllCancellableTasks(ResiliencyMode.SOFT); assertEquals(2, cancellableTasksFrom1.size()); assertEquals(5678, cancellableTasksFrom1.get(0).getTask().getId()); assertEquals(8765, cancellableTasksFrom1.get(1).getTask().getId()); @@ -534,8 +524,7 @@ public void testGetAllCancellableTasks_ReturnsNoTasksWhenNotBreachingThresholds( QueryGroup queryGroup1 = new QueryGroup( "testQueryGroup", queryGroupId1, - QueryGroup.ResiliencyMode.ENFORCED, - Map.of(resourceType, threshold), + new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, 
Map.of(resourceType, threshold)), 1L ); when(queryGroupCpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); @@ -549,7 +538,7 @@ public void testGetAllCancellableTasks_ReturnsNoTasksWhenNotBreachingThresholds( queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); - List allCancellableTasks = taskCancellation.getAllCancellableTasks(QueryGroup.ResiliencyMode.ENFORCED); + List allCancellableTasks = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); assertTrue(allCancellableTasks.isEmpty()); } @@ -562,8 +551,7 @@ public void testGetAllCancellableTasks_ReturnsTasksWhenBreachingThresholds() { QueryGroup queryGroup1 = new QueryGroup( "testQueryGroup", queryGroupId1, - QueryGroup.ResiliencyMode.ENFORCED, - Map.of(resourceType, threshold), + new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold)), 1L ); when(queryGroupCpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); @@ -578,7 +566,7 @@ public void testGetAllCancellableTasks_ReturnsTasksWhenBreachingThresholds() { queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); - List allCancellableTasks = taskCancellation.getAllCancellableTasks(QueryGroup.ResiliencyMode.ENFORCED); + List allCancellableTasks = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); assertEquals(2, allCancellableTasks.size()); assertEquals(1234, allCancellableTasks.get(0).getTask().getId()); assertEquals(4321, allCancellableTasks.get(1).getTask().getId()); @@ -592,15 +580,13 @@ public void testGetCancellableTasksFrom_doesNotReturnTasksWhenQueryGroupIdNotFou QueryGroup queryGroup1 = new QueryGroup( "testQueryGroup1", queryGroupId1, - QueryGroup.ResiliencyMode.ENFORCED, - Map.of(resourceType, threshold), + new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold)), 1L ); QueryGroup queryGroup2 = new QueryGroup( "testQueryGroup2", queryGroupId2, - 
QueryGroup.ResiliencyMode.ENFORCED, - Map.of(resourceType, threshold), + new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold)), 1L ); diff --git a/server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTests.java b/server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTests.java index a755287c79336..16c47e956a841 100644 --- a/server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTests.java +++ b/server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTests.java @@ -11,6 +11,8 @@ import org.opensearch.cluster.metadata.QueryGroup; import org.opensearch.core.tasks.resourcetracker.ResourceStats; import org.opensearch.test.OpenSearchTestCase; +import org.opensearch.wlm.MutableQueryGroupFragment; +import org.opensearch.wlm.MutableQueryGroupFragment.ResiliencyMode; import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; import org.opensearch.wlm.WorkloadManagementSettings; @@ -44,8 +46,7 @@ public void testQueryGroupCpuUsage() { clock.fastForwardBy(fastForwardTime); QueryGroup queryGroup = new QueryGroup( "testQG", - QueryGroup.ResiliencyMode.ENFORCED, - Map.of(ResourceType.CPU, 0.5 / PROCESSOR_COUNT) + new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(ResourceType.CPU, 0.5 / PROCESSOR_COUNT)) ); sut.initialise(List.of(createMockTaskWithResourceStats(QueryGroupTask.class, fastForwardTime, 200, 0, 123)), clock::getTime); @@ -66,8 +67,7 @@ public void testQueryGroupMemoryUsage() { TestClock clock = new TestClock(); QueryGroup queryGroup = new QueryGroup( "testQG", - QueryGroup.ResiliencyMode.ENFORCED, - Map.of(ResourceType.MEMORY, 500.0 / HEAP_SIZE_BYTES) + new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(ResourceType.MEMORY, 500.0 / HEAP_SIZE_BYTES)) ); sut.initialise(List.of(createMockTaskWithResourceStats(QueryGroupTask.class, 100, 200, 0, 123)), clock::getTime); From d2d02e350c3d172efa98d351b5e2184863803820 Mon Sep 17 
00:00:00 2001 From: Kaushal Kumar Date: Wed, 4 Sep 2024 14:41:10 -0700 Subject: [PATCH 19/47] add queryGroupService Signed-off-by: Kaushal Kumar --- .../main/java/org/opensearch/node/Node.java | 21 ++- .../org/opensearch/wlm/QueryGroupService.java | 155 +++++++++++++++++- .../main/java/org/opensearch/wlm/WlmMode.java | 36 ++++ .../wlm/WorkloadManagementSettings.java | 73 +++++++++ ...adManagementTransportInterceptorTests.java | 33 +++- ...eryGroupRequestOperationListenerTests.java | 51 ++++-- 6 files changed, 346 insertions(+), 23 deletions(-) create mode 100644 server/src/main/java/org/opensearch/wlm/WlmMode.java diff --git a/server/src/main/java/org/opensearch/node/Node.java b/server/src/main/java/org/opensearch/node/Node.java index c55c2ab683b11..cc8fb3515cb2a 100644 --- a/server/src/main/java/org/opensearch/node/Node.java +++ b/server/src/main/java/org/opensearch/node/Node.java @@ -270,6 +270,8 @@ import org.opensearch.usage.UsageService; import org.opensearch.watcher.ResourceWatcherService; import org.opensearch.wlm.QueryGroupService; +import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService; +import org.opensearch.wlm.WorkloadManagementSettings; import org.opensearch.wlm.WorkloadManagementTransportInterceptor; import org.opensearch.wlm.listeners.QueryGroupRequestOperationListener; @@ -1023,8 +1025,21 @@ protected Node( List identityAwarePlugins = pluginsService.filterPlugins(IdentityAwarePlugin.class); identityService.initializeIdentityAwarePlugins(identityAwarePlugins); - final QueryGroupService queryGroupService = new QueryGroupService(); // We will need to replace this with actual instance of the - // queryGroupService + QueryGroupResourceUsageTrackerService queryGroupResourceUsageTrackerService = new QueryGroupResourceUsageTrackerService( + taskResourceTrackingService, + System::nanoTime + ); + WorkloadManagementSettings workloadManagementSettings = new WorkloadManagementSettings( + settings, + settingsModule.getClusterSettings() + ); + 
final QueryGroupService queryGroupService = new QueryGroupService( + queryGroupResourceUsageTrackerService, + clusterService, + threadPool, + workloadManagementSettings + ); + final QueryGroupRequestOperationListener queryGroupRequestOperationListener = new QueryGroupRequestOperationListener( queryGroupService, threadPool @@ -1090,7 +1105,7 @@ protected Node( WorkloadManagementTransportInterceptor workloadManagementTransportInterceptor = new WorkloadManagementTransportInterceptor( threadPool, - new QueryGroupService() // We will need to replace this with actual implementation + queryGroupService ); final Collection secureSettingsFactories = pluginsService.filterPlugins(Plugin.class) diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java index 6545598dd9951..b4fe733177b43 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java @@ -7,33 +7,168 @@ */ package org.opensearch.wlm; - +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.opensearch.cluster.ClusterChangedEvent; +import org.opensearch.cluster.ClusterStateApplier; +import org.opensearch.cluster.metadata.Metadata; +import org.opensearch.cluster.metadata.QueryGroup; +import org.opensearch.cluster.service.ClusterService; +import org.opensearch.common.lifecycle.AbstractLifecycleComponent; import org.opensearch.core.concurrency.OpenSearchRejectedExecutionException; +import org.opensearch.monitor.jvm.JvmStats; +import org.opensearch.monitor.process.ProcessProbe; +import org.opensearch.search.backpressure.trackers.NodeDuressTrackers; +import org.opensearch.search.backpressure.trackers.NodeDuressTrackers.NodeDuressTracker; +import org.opensearch.threadpool.Scheduler; +import org.opensearch.threadpool.ThreadPool; +import org.opensearch.wlm.cancellation.DefaultTaskCancellation; +import 
org.opensearch.wlm.cancellation.DefaultTaskSelectionStrategy; import org.opensearch.wlm.stats.QueryGroupState; import org.opensearch.wlm.stats.QueryGroupStats; import org.opensearch.wlm.stats.QueryGroupStats.QueryGroupStatsHolder; +import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService; +import java.io.IOException; import java.util.HashMap; +import java.util.HashSet; import java.util.Map; +import java.util.Set; /** * As of now this is a stub and main implementation PR will be raised soon.Coming PR will collate these changes with core QueryGroupService changes */ -public class QueryGroupService { +public class QueryGroupService extends AbstractLifecycleComponent implements ClusterStateApplier { // This map does not need to be concurrent since we will process the cluster state change serially and update // this map with new additions and deletions of entries. QueryGroupState is thread safe private final Map queryGroupStateMap; + private static final Logger logger = LogManager.getLogger(QueryGroupService.class); + + private final QueryGroupResourceUsageTrackerService queryGroupUsageTracker; + private volatile Scheduler.Cancellable scheduledFuture; + private final ThreadPool threadPool; + private final ClusterService clusterService; + private final WorkloadManagementSettings workloadManagementSettings; + private Set activeQueryGroups = new HashSet<>(); + private Set deletedQueryGroups = new HashSet<>(); + private NodeDuressTrackers nodeDuressTrackers; - public QueryGroupService() { - this(new HashMap<>()); + public QueryGroupService( + QueryGroupResourceUsageTrackerService queryGroupUsageTracker, + ClusterService clusterService, + ThreadPool threadPool, + WorkloadManagementSettings workloadManagementSettings) { + this(queryGroupUsageTracker, clusterService, threadPool, workloadManagementSettings, new HashMap<>()); } - public QueryGroupService(Map queryGroupStateMap) { + public QueryGroupService( + QueryGroupResourceUsageTrackerService 
queryGroupUsageTracker, + ClusterService clusterService, + ThreadPool threadPool, + WorkloadManagementSettings workloadManagementSettings, + Map queryGroupStateMap + ) { + this.queryGroupUsageTracker = queryGroupUsageTracker; + this.clusterService = clusterService; + this.threadPool = threadPool; + this.workloadManagementSettings = workloadManagementSettings; + this.nodeDuressTrackers = new NodeDuressTrackers( + Map.of(ResourceType.CPU, new NodeDuressTracker(() -> + workloadManagementSettings.getNodeLevelCpuCancellationThreshold() < ProcessProbe.getInstance().getProcessCpuPercent() / 100.0, () -> 3), + ResourceType.MEMORY, new NodeDuressTracker( + () -> workloadManagementSettings.getNodeLevelMemoryCancellationThreshold() <= JvmStats.jvmStats().getMem().getHeapUsedPercent() / 100.0, () -> 3)) + ); + this.activeQueryGroups = getActiveQueryGroupsFromClusterState(); + + // this logic here is to ensure the proper initialisation of queryGroupState for query groups from persisted metadata this.queryGroupStateMap = queryGroupStateMap; + this.activeQueryGroups.forEach(queryGroup -> queryGroupStateMap.put(queryGroup.get_id(), new QueryGroupState())); } + /** + * run at regular interval + */ + protected void doRun() { + if (workloadManagementSettings.getWlmMode() == WlmMode.DISABLED) { + return; + } + + Map queryGroupLevelResourceUsageViews = queryGroupUsageTracker + .constructQueryGroupLevelUsageViews(); + DefaultTaskCancellation defaultTaskCancellation = new DefaultTaskCancellation( + workloadManagementSettings, + new DefaultTaskSelectionStrategy(), + queryGroupLevelResourceUsageViews, + activeQueryGroups, + deletedQueryGroups, + () -> nodeDuressTrackers.isNodeInDuress() + ); + defaultTaskCancellation.cancelTasks(); + } + + /** + * {@link AbstractLifecycleComponent} lifecycle method + */ + @Override + protected void doStart() { + scheduledFuture = threadPool.scheduleWithFixedDelay(() -> { + try { + doRun(); + } catch (Exception e) { + logger.debug("Exception occurred in 
Query Sandbox service", e); + } + }, this.workloadManagementSettings.getQueryGroupServiceRunInterval(), ThreadPool.Names.GENERIC); + } + + @Override + protected void doStop() { + if (scheduledFuture != null) { + scheduledFuture.cancel(); + } + } + + @Override + protected void doClose() throws IOException {} + + protected Set getActiveQueryGroupsFromClusterState() { + Map queryGroups = clusterService.state().metadata().queryGroups(); + return new HashSet<>(queryGroups.values()); + } + + @Override + public void applyClusterState(ClusterChangedEvent event) { + // Retrieve the current and previous cluster states + Metadata previousMetadata = event.previousState().metadata(); + Metadata currentMetadata = event.state().metadata(); + + // Extract the query groups from both the current and previous cluster states + Map previousQueryGroups = previousMetadata.queryGroups(); + Map currentQueryGroups = currentMetadata.queryGroups(); + + // Detect new query groups added in the current cluster state + for (String queryGroupName : currentQueryGroups.keySet()) { + if (!previousQueryGroups.containsKey(queryGroupName)) { + // New query group detected + QueryGroup newQueryGroup = currentQueryGroups.get(queryGroupName); + // Perform any necessary actions with the new query group + this.activeQueryGroups.add(newQueryGroup); + } + } + + // Detect query groups deleted in the current cluster state + for (String queryGroupName : previousQueryGroups.keySet()) { + if (!currentQueryGroups.containsKey(queryGroupName)) { + // Query group deleted + QueryGroup deletedQueryGroup = previousQueryGroups.get(queryGroupName); + // Perform any necessary actions with the deleted query group + this.deletedQueryGroups.add(deletedQueryGroup); + } + } + } // tested + /** * updates the failure stats for the query group + * * @param queryGroupId query group identifier */ public void incrementFailuresFor(final String queryGroupId) { @@ -47,7 +182,6 @@ public void incrementFailuresFor(final String queryGroupId) 
{ } /** - * * @return node level query group stats */ public QueryGroupStats nodeStats() { @@ -63,7 +197,6 @@ public QueryGroupStats nodeStats() { } /** - * * @param queryGroupId query group identifier */ public void rejectIfNeeded(String queryGroupId) { @@ -77,4 +210,12 @@ public void rejectIfNeeded(String queryGroupId) { throw new OpenSearchRejectedExecutionException("QueryGroup " + queryGroupId + " is already contended." + reason.toString()); } } + + protected Set getDeletedQueryGroups() { + return deletedQueryGroups; + } + + protected Set getActiveQueryGroups() { + return activeQueryGroups; + } } diff --git a/server/src/main/java/org/opensearch/wlm/WlmMode.java b/server/src/main/java/org/opensearch/wlm/WlmMode.java new file mode 100644 index 0000000000000..06837ed2cacc4 --- /dev/null +++ b/server/src/main/java/org/opensearch/wlm/WlmMode.java @@ -0,0 +1,36 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.wlm; + +/** + * Enum to hold the values whether wlm is enabled or not + */ +public enum WlmMode { + ENABLED("enabled"), + MONITOR_ONLY("monitor_only"), + DISABLED("disabled"); + + private final String name; + WlmMode(String name) { + this.name = name; + } + + public String getName() { + return name; + } + + public static WlmMode fromName(String name) { + for (WlmMode wlmMode: values()) { + if (wlmMode.getName().equals(name)) { + return wlmMode; + } + } + throw new IllegalArgumentException(name + " is an invalid WlmMode"); + } +} diff --git a/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java b/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java index b104925df77b3..b27cd302470e9 100644 --- a/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java +++ b/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java @@ -11,6 +11,7 @@ import org.opensearch.common.settings.ClusterSettings; import org.opensearch.common.settings.Setting; import org.opensearch.common.settings.Settings; +import org.opensearch.common.unit.TimeValue; /** * Main class to declare Workload Management related settings @@ -20,16 +21,52 @@ public class WorkloadManagementSettings { private static final Double DEFAULT_NODE_LEVEL_MEMORY_CANCELLATION_THRESHOLD = 0.9; private static final Double DEFAULT_NODE_LEVEL_CPU_REJECTION_THRESHOLD = 0.8; private static final Double DEFAULT_NODE_LEVEL_CPU_CANCELLATION_THRESHOLD = 0.9; + private static final Long DEFAULT_QUERYGROUP_SERVICE_RUN_INTERVAL_MILLIS = 1000L; public static final double NODE_LEVEL_MEMORY_CANCELLATION_THRESHOLD_MAX_VALUE = 0.95; public static final double NODE_LEVEL_MEMORY_REJECTION_THRESHOLD_MAX_VALUE = 0.9; public static final double NODE_LEVEL_CPU_CANCELLATION_THRESHOLD_MAX_VALUE = 0.95; public static final double NODE_LEVEL_CPU_REJECTION_THRESHOLD_MAX_VALUE = 0.9; + public static final String DEFAULT_WLM_MODE = "monitor_only"; private Double 
nodeLevelMemoryCancellationThreshold; private Double nodeLevelMemoryRejectionThreshold; private Double nodeLevelCpuCancellationThreshold; private Double nodeLevelCpuRejectionThreshold; + /** + * Setting name for Query Group Service run interval + */ + public static final String QUERYGROUP_SERVICE_RUN_INTERVAL_SETTING_NAME = "wlm.query_group.service.run_interval"; + + private TimeValue queryGroupServiceRunInterval; + /** + * Setting to control the run interval of Query Group Service + */ + public static final Setting QUERYGROUP_SERVICE_RUN_INTERVAL_SETTING = Setting.longSetting( + QUERYGROUP_SERVICE_RUN_INTERVAL_SETTING_NAME, + DEFAULT_QUERYGROUP_SERVICE_RUN_INTERVAL_MILLIS, + 1000, + Setting.Property.NodeScope + ); + + /** + * WLM mode setting name + */ + public static final String WLM_MODE_SETTING_NAME = "wlm.query_group.mode"; + + private volatile WlmMode wlmMode; + + /** + * WLM mode setting, which determines which mode WLM is operating in + */ + public static final Setting WLM_MODE_SETTING = new Setting( + WLM_MODE_SETTING_NAME, + DEFAULT_WLM_MODE, + WlmMode::fromName, + Setting.Property.Dynamic, + Setting.Property.NodeScope + ); + /** * Setting name for node level memory based rejection threshold for QueryGroup service */ @@ -89,10 +126,12 @@ public class WorkloadManagementSettings { * @param clusterSettings - QueryGroup cluster settings */ public WorkloadManagementSettings(Settings settings, ClusterSettings clusterSettings) { + this.wlmMode = WLM_MODE_SETTING.get(settings); nodeLevelMemoryCancellationThreshold = NODE_LEVEL_MEMORY_CANCELLATION_THRESHOLD.get(settings); nodeLevelMemoryRejectionThreshold = NODE_LEVEL_MEMORY_REJECTION_THRESHOLD.get(settings); nodeLevelCpuCancellationThreshold = NODE_LEVEL_CPU_CANCELLATION_THRESHOLD.get(settings); nodeLevelCpuRejectionThreshold = NODE_LEVEL_CPU_REJECTION_THRESHOLD.get(settings); + this.queryGroupServiceRunInterval = TimeValue.timeValueMillis(QUERYGROUP_SERVICE_RUN_INTERVAL_SETTING.get(settings)); 
ensureRejectionThresholdIsLessThanCancellation( nodeLevelMemoryRejectionThreshold, @@ -111,6 +150,40 @@ public WorkloadManagementSettings(Settings settings, ClusterSettings clusterSett clusterSettings.addSettingsUpdateConsumer(NODE_LEVEL_MEMORY_REJECTION_THRESHOLD, this::setNodeLevelMemoryRejectionThreshold); clusterSettings.addSettingsUpdateConsumer(NODE_LEVEL_CPU_CANCELLATION_THRESHOLD, this::setNodeLevelCpuCancellationThreshold); clusterSettings.addSettingsUpdateConsumer(NODE_LEVEL_CPU_REJECTION_THRESHOLD, this::setNodeLevelCpuRejectionThreshold); + clusterSettings.addSettingsUpdateConsumer(WLM_MODE_SETTING, this::setWlmMode); + clusterSettings.addSettingsUpdateConsumer(QUERYGROUP_SERVICE_RUN_INTERVAL_SETTING, this::setQueryGroupServiceRunInterval); + } + + /** + * queryGroupServiceRunInterval setter + * @param newIntervalInMillis new value + */ + public void setQueryGroupServiceRunInterval(long newIntervalInMillis) { + this.queryGroupServiceRunInterval = TimeValue.timeValueMillis(newIntervalInMillis); + } + + /** + * queryGroupServiceRunInterval getter + * @return current queryGroupServiceRunInterval value + */ + public TimeValue getQueryGroupServiceRunInterval() { + return this.queryGroupServiceRunInterval; + } + + /** + * WlmMode setter + * @param mode new mode value + */ + public void setWlmMode(final WlmMode mode) { + this.wlmMode = mode; + } + + /** + * WlmMode getter + * @return the current wlmMode + */ + public WlmMode getWlmMode() { + return this.wlmMode; } /** diff --git a/server/src/test/java/org/opensearch/wlm/WorkloadManagementTransportInterceptorTests.java b/server/src/test/java/org/opensearch/wlm/WorkloadManagementTransportInterceptorTests.java index 4668b845150a9..6d719a8be9df2 100644 --- a/server/src/test/java/org/opensearch/wlm/WorkloadManagementTransportInterceptorTests.java +++ b/server/src/test/java/org/opensearch/wlm/WorkloadManagementTransportInterceptorTests.java @@ -8,24 +8,53 @@ package org.opensearch.wlm; +import 
org.opensearch.cluster.ClusterState; +import org.opensearch.cluster.metadata.Metadata; +import org.opensearch.cluster.service.ClusterService; import org.opensearch.test.OpenSearchTestCase; import org.opensearch.threadpool.TestThreadPool; import org.opensearch.threadpool.ThreadPool; import org.opensearch.transport.TransportRequest; import org.opensearch.transport.TransportRequestHandler; import org.opensearch.wlm.WorkloadManagementTransportInterceptor.RequestHandler; +import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService; +import java.util.Collections; +import java.util.HashMap; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; import static org.opensearch.threadpool.ThreadPool.Names.SAME; public class WorkloadManagementTransportInterceptorTests extends OpenSearchTestCase { - + private QueryGroupResourceUsageTrackerService mockQueryGroupUsageTracker; + private ClusterService mockClusterService; + private ThreadPool mockThreadPool; + private WorkloadManagementSettings mockWorkloadManagementSettings; private ThreadPool threadPool; private WorkloadManagementTransportInterceptor sut; public void setUp() throws Exception { super.setUp(); + mockQueryGroupUsageTracker = mock(QueryGroupResourceUsageTrackerService.class); + mockClusterService = mock(ClusterService.class); + mockThreadPool = mock(ThreadPool.class); + mockWorkloadManagementSettings = mock(WorkloadManagementSettings.class); threadPool = new TestThreadPool(getTestName()); - sut = new WorkloadManagementTransportInterceptor(threadPool, new QueryGroupService()); + ClusterState state = mock(ClusterState.class); + Metadata metadata = mock(Metadata.class); + when(mockClusterService.state()).thenReturn(state); + when(state.metadata()).thenReturn(metadata); + when(metadata.queryGroups()).thenReturn(Collections.emptyMap()); + sut = new WorkloadManagementTransportInterceptor(threadPool, + new QueryGroupService( + mockQueryGroupUsageTracker, + mockClusterService, + 
mockThreadPool, + mockWorkloadManagementSettings, + new HashMap<>() + ) + ); } public void tearDown() throws Exception { diff --git a/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java b/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java index 0307ff623c408..6359f8a8f9752 100644 --- a/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java +++ b/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java @@ -8,38 +8,45 @@ package org.opensearch.wlm.listeners; +import org.opensearch.cluster.ClusterState; +import org.opensearch.cluster.metadata.Metadata; +import org.opensearch.cluster.service.ClusterService; import org.opensearch.common.util.concurrent.ThreadContext; import org.opensearch.core.concurrency.OpenSearchRejectedExecutionException; import org.opensearch.test.OpenSearchTestCase; +import org.opensearch.threadpool.Scheduler; import org.opensearch.threadpool.TestThreadPool; import org.opensearch.threadpool.ThreadPool; import org.opensearch.wlm.QueryGroupService; import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; +import org.opensearch.wlm.WorkloadManagementSettings; import org.opensearch.wlm.stats.QueryGroupState; import org.opensearch.wlm.stats.QueryGroupStats; +import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService; import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; -import static org.mockito.Mockito.doNothing; -import static org.mockito.Mockito.doThrow; -import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.*; +import static org.mockito.Mockito.when; public class QueryGroupRequestOperationListenerTests extends OpenSearchTestCase { public static final int ITERATIONS = 20; ThreadPool testThreadPool; QueryGroupService 
queryGroupService; - + private QueryGroupResourceUsageTrackerService mockQueryGroupUsageTracker; + private ClusterService mockClusterService; + private WorkloadManagementSettings mockWorkloadManagementSettings; Map queryGroupStateMap; String testQueryGroupId; QueryGroupRequestOperationListener sut; public void setUp() throws Exception { super.setUp(); + mockQueryGroupUsageTracker = mock(QueryGroupResourceUsageTrackerService.class); + mockClusterService = mock(ClusterService.class); + mockWorkloadManagementSettings = mock(WorkloadManagementSettings.class); queryGroupStateMap = new HashMap<>(); testQueryGroupId = "safjgagnakg-3r3fads"; testThreadPool = new TestThreadPool("RejectionTestThreadPool"); @@ -93,8 +100,14 @@ public void testValidQueryGroupRequestFailure() throws IOException { public void testMultiThreadedValidQueryGroupRequestFailures() { queryGroupStateMap.put(testQueryGroupId, new QueryGroupState()); - - queryGroupService = new QueryGroupService(queryGroupStateMap); + setupMockedQueryGroupsFromClusterState(); + queryGroupService = new QueryGroupService( + mockQueryGroupUsageTracker, + mockClusterService, + testThreadPool, + mockWorkloadManagementSettings, + queryGroupStateMap + ); sut = new QueryGroupRequestOperationListener(queryGroupService, testThreadPool); @@ -174,7 +187,15 @@ private void assertSuccess( testThreadPool.getThreadContext().putHeader(QueryGroupTask.QUERY_GROUP_ID_HEADER, threadContextQG_Id); queryGroupStateMap.put(testQueryGroupId, new QueryGroupState()); - queryGroupService = new QueryGroupService(queryGroupStateMap); + setupMockedQueryGroupsFromClusterState(); + + queryGroupService = new QueryGroupService( + mockQueryGroupUsageTracker, + mockClusterService, + testThreadPool, + mockWorkloadManagementSettings, + queryGroupStateMap + ); sut = new QueryGroupRequestOperationListener(queryGroupService, testThreadPool); sut.onRequestFailure(null, null); @@ -184,4 +205,12 @@ private void assertSuccess( } } + + private void 
setupMockedQueryGroupsFromClusterState() { + ClusterState state = mock(ClusterState.class); + Metadata metadata = mock(Metadata.class); + when(mockClusterService.state()).thenReturn(state); + when(state.metadata()).thenReturn(metadata); + when(metadata.queryGroups()).thenReturn(Collections.emptyMap()); + } } From 448ea411b9392c9c1948ed74479ca835dd982d01 Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Thu, 5 Sep 2024 13:42:14 -0700 Subject: [PATCH 20/47] refactor and add UTs for new constructs Signed-off-by: Kaushal Kumar --- .../wlm/QueryGroupLevelResourceUsageView.java | 8 +- .../java/org/opensearch/wlm/ResourceType.java | 1 - ...estTaskRunningFirstSelectionStrategy.java} | 28 +- ...skCancellation.java => TaskCanceller.java} | 52 ++-- .../cancellation/TaskSelectionStrategy.java | 28 ++ .../wlm/tracker/CpuUsageCalculator.java | 47 ++++ .../wlm/tracker/MemoryUsageCalculator.java | 40 +++ .../wlm/tracker/QueryGroupResourceUsage.java | 128 --------- ...QueryGroupResourceUsageTrackerService.java | 22 +- .../wlm/tracker/ResourceUsageCalculator.java | 35 +++ .../ResourceUsageCalculatorFactory.java | 33 +++ .../wlm/tracker/ResourceUsageUtil.java | 74 ++++++ .../wlm/tracker/ResourceUsageUtilFactory.java | 33 +++ .../tracker/TaskResourceUsageCalculator.java | 59 ----- ...QueryGroupLevelResourceUsageViewTests.java | 36 +-- ...skRunningFirstSelectionStrategyTests.java} | 41 ++- ...tionTests.java => TaskCancellerTests.java} | 244 +++++++++--------- .../tracker/QueryGroupResourceUsageTests.java | 98 ------- .../tracker/ResourceUsageCalculatorTests.java | 84 ++++++ ...ceUsageCalculatorTrackerServiceTests.java} | 33 ++- .../wlm/tracker/ResourceUsageUtilTests.java | 70 +++++ .../TaskResourceUsageCalculatorTests.java | 52 ---- 22 files changed, 688 insertions(+), 558 deletions(-) rename server/src/main/java/org/opensearch/wlm/cancellation/{DefaultTaskSelectionStrategy.java => LongestTaskRunningFirstSelectionStrategy.java} (68%) rename 
server/src/main/java/org/opensearch/wlm/cancellation/{DefaultTaskCancellation.java => TaskCanceller.java} (81%) create mode 100644 server/src/main/java/org/opensearch/wlm/cancellation/TaskSelectionStrategy.java create mode 100644 server/src/main/java/org/opensearch/wlm/tracker/CpuUsageCalculator.java create mode 100644 server/src/main/java/org/opensearch/wlm/tracker/MemoryUsageCalculator.java delete mode 100644 server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsage.java create mode 100644 server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculator.java create mode 100644 server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorFactory.java create mode 100644 server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtil.java create mode 100644 server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtilFactory.java delete mode 100644 server/src/main/java/org/opensearch/wlm/tracker/TaskResourceUsageCalculator.java rename server/src/test/java/org/opensearch/wlm/cancellation/{DefaultTaskSelectionStrategyTests.java => LongestTaskRunningFirstSelectionStrategyTests.java} (68%) rename server/src/test/java/org/opensearch/wlm/cancellation/{DefaultTaskCancellationTests.java => TaskCancellerTests.java} (71%) delete mode 100644 server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTests.java create mode 100644 server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java rename server/src/test/java/org/opensearch/wlm/tracker/{QueryGroupResourceUsageTrackerServiceTests.java => ResourceUsageCalculatorTrackerServiceTests.java} (86%) create mode 100644 server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageUtilTests.java delete mode 100644 server/src/test/java/org/opensearch/wlm/tracker/TaskResourceUsageCalculatorTests.java diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupLevelResourceUsageView.java 
b/server/src/main/java/org/opensearch/wlm/QueryGroupLevelResourceUsageView.java index dbe942b461b9c..de213eaab64a8 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupLevelResourceUsageView.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupLevelResourceUsageView.java @@ -8,8 +8,6 @@ package org.opensearch.wlm; -import org.opensearch.wlm.tracker.QueryGroupResourceUsage; - import java.util.List; import java.util.Map; @@ -20,11 +18,11 @@ */ public class QueryGroupLevelResourceUsageView { // resourceUsage holds the resource usage data for a QueryGroup at a point in time - private final Map resourceUsage; + private final Map resourceUsage; // activeTasks holds the list of active tasks for a QueryGroup at a point in time private final List activeTasks; - public QueryGroupLevelResourceUsageView(Map resourceUsage, List activeTasks) { + public QueryGroupLevelResourceUsageView(Map resourceUsage, List activeTasks) { this.resourceUsage = resourceUsage; this.activeTasks = activeTasks; } @@ -34,7 +32,7 @@ public QueryGroupLevelResourceUsageView(Map getResourceUsageData() { + public Map getResourceUsageData() { return resourceUsage; } diff --git a/server/src/main/java/org/opensearch/wlm/ResourceType.java b/server/src/main/java/org/opensearch/wlm/ResourceType.java index b5992448204fc..3fde1bbf1fee5 100644 --- a/server/src/main/java/org/opensearch/wlm/ResourceType.java +++ b/server/src/main/java/org/opensearch/wlm/ResourceType.java @@ -26,7 +26,6 @@ public enum ResourceType { private final String name; private final boolean statsEnabled; - private static List sortedValues = List.of(CPU, MEMORY); ResourceType(String name, boolean statsEnabled) { diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/LongestTaskRunningFirstSelectionStrategy.java similarity index 68% rename from server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java 
rename to server/src/main/java/org/opensearch/wlm/cancellation/LongestTaskRunningFirstSelectionStrategy.java index 12798fba6e297..26669fc0d84e6 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategy.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/LongestTaskRunningFirstSelectionStrategy.java @@ -10,7 +10,7 @@ import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; -import org.opensearch.wlm.tracker.TaskResourceUsageCalculator; +import org.opensearch.wlm.tracker.ResourceUsageCalculatorFactory; import java.util.ArrayList; import java.util.Collections; @@ -19,32 +19,35 @@ import java.util.function.Supplier; import java.util.stream.Collectors; -import static org.opensearch.wlm.cancellation.DefaultTaskCancellation.MIN_VALUE; +import static org.opensearch.wlm.cancellation.TaskCanceller.MIN_VALUE; /** - * Represents an abstract task selection strategy. - * This class implements the DefaultTaskSelectionStrategy interface and provides a method to select tasks for cancellation based on a sorting condition. - * The specific sorting condition depends on the implementation. + * Represents the longest running task first selection strategy. 
 */ -public class DefaultTaskSelectionStrategy { +public class LongestTaskRunningFirstSelectionStrategy implements TaskSelectionStrategy { private final Supplier nanoTimeSupplier; + private final ResourceUsageCalculatorFactory resourceUsageCalculatorFactory; - public DefaultTaskSelectionStrategy() { - this(System::nanoTime); + public LongestTaskRunningFirstSelectionStrategy() { + this(System::nanoTime, ResourceUsageCalculatorFactory.getInstance()); } - public DefaultTaskSelectionStrategy(Supplier nanoTimeSupplier) { + public LongestTaskRunningFirstSelectionStrategy( + Supplier nanoTimeSupplier, + ResourceUsageCalculatorFactory resourceUsageCalculatorFactory + ) { this.nanoTimeSupplier = nanoTimeSupplier; + this.resourceUsageCalculatorFactory = resourceUsageCalculatorFactory; } /** * Returns a comparator that defines the sorting condition for tasks. - * This is the default implementation since the longest running tasks are the ones that consume the most resources. + * This is the default implementation since the longest running tasks are the ones most likely to regress performance. 
* * @return The comparator */ - public Comparator sortingCondition() { + protected Comparator sortingCondition() { return Comparator.comparingLong(QueryGroupTask::getStartTime); } @@ -72,7 +75,8 @@ public List selectTasksForCancellation(List task double accumulated = 0; for (QueryGroupTask task : sortedTasks) { selectedTasks.add(task); - accumulated += TaskResourceUsageCalculator.from(resourceType).calculateFor(task, nanoTimeSupplier); + accumulated += resourceUsageCalculatorFactory.getInstanceForResourceType(resourceType) + .calculateTaskResourceUsage(task, nanoTimeSupplier); if ((accumulated - limit) > MIN_VALUE) { break; } diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java b/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java similarity index 81% rename from server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java rename to server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java index 9e45f278ce6e2..4ed7148b0f4fa 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/DefaultTaskCancellation.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java @@ -16,7 +16,8 @@ import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; import org.opensearch.wlm.WorkloadManagementSettings; -import org.opensearch.wlm.tracker.QueryGroupResourceUsage; +import org.opensearch.wlm.tracker.ResourceUsageUtil; +import org.opensearch.wlm.tracker.ResourceUsageUtilFactory; import java.util.ArrayList; import java.util.Collection; @@ -30,7 +31,7 @@ /** * Manages the cancellation of tasks enforced by QueryGroup thresholds on resource usage criteria. 
- * This class utilizes a strategy pattern through {@link DefaultTaskSelectionStrategy} to identify tasks that exceed + * This class utilizes a strategy pattern through {@link LongestTaskRunningFirstSelectionStrategy} to identify tasks that exceed * predefined resource usage limits and are therefore eligible for cancellation. * *

The cancellation process is initiated by evaluating the resource usage of each QueryGroup against its @@ -41,35 +42,38 @@ * views, a set of active QueryGroups, and a task selection strategy. These components collectively facilitate the * identification and cancellation of tasks that threaten to breach QueryGroup resource limits.

* - * @see DefaultTaskSelectionStrategy + * @see LongestTaskRunningFirstSelectionStrategy * @see QueryGroup * @see ResourceType */ -public class DefaultTaskCancellation { +public class TaskCanceller { public static final double MIN_VALUE = 1e-9; protected final WorkloadManagementSettings workloadManagementSettings; - protected final DefaultTaskSelectionStrategy defaultTaskSelectionStrategy; + protected final TaskSelectionStrategy taskSelectionStrategy; // a map of QueryGroupId to its corresponding QueryGroupLevelResourceUsageView object protected final Map queryGroupLevelResourceUsageViews; protected final Collection activeQueryGroups; protected final Collection deletedQueryGroups; protected BooleanSupplier isNodeInDuress; + private final ResourceUsageUtilFactory resourceUsageUtilFactory; - public DefaultTaskCancellation( + public TaskCanceller( WorkloadManagementSettings workloadManagementSettings, - DefaultTaskSelectionStrategy defaultTaskSelectionStrategy, + LongestTaskRunningFirstSelectionStrategy taskSelectionStrategy, Map queryGroupLevelResourceUsageViews, Collection activeQueryGroups, Collection deletedQueryGroups, - BooleanSupplier isNodeInDuress + BooleanSupplier isNodeInDuress, + ResourceUsageUtilFactory resourceUsageUtilFactory ) { this.workloadManagementSettings = workloadManagementSettings; - this.defaultTaskSelectionStrategy = defaultTaskSelectionStrategy; + this.taskSelectionStrategy = taskSelectionStrategy; this.queryGroupLevelResourceUsageViews = queryGroupLevelResourceUsageViews; this.activeQueryGroups = activeQueryGroups; this.deletedQueryGroups = deletedQueryGroups; this.isNodeInDuress = isNodeInDuress; + this.resourceUsageUtilFactory = resourceUsageUtilFactory; } /** @@ -131,13 +135,14 @@ private List getQueryGroupsToCancelFrom(ResiliencyMode resiliencyMod if (queryGroup.getResiliencyMode() != resiliencyMode) { continue; } - Map queryGroupResourcesUsage = queryGroupLevelResourceUsageViews.get(queryGroup.get_id()) + Map queryGroupResourcesUsage 
= queryGroupLevelResourceUsageViews.get(queryGroup.get_id()) .getResourceUsageData(); for (ResourceType resourceType : TRACKED_RESOURCES) { if (queryGroup.getResourceLimits().containsKey(resourceType)) { - final QueryGroupResourceUsage queryGroupResourceUsage = queryGroupResourcesUsage.get(resourceType); - if (queryGroupResourceUsage.isBreachingThresholdFor(queryGroup, workloadManagementSettings)) { + final double currentUsage = queryGroupResourcesUsage.get(resourceType); + final ResourceUsageUtil resourceUsageUtil = resourceUsageUtilFactory.getInstanceForResourceType(resourceType); + if (resourceUsageUtil.isBreachingThresholdFor(queryGroup, currentUsage, workloadManagementSettings)) { queryGroupsToCancelFrom.add(queryGroup); break; } @@ -171,19 +176,13 @@ protected List getCancellableTasksFrom(QueryGroup queryGroup) } private boolean shouldCancelTasks(QueryGroup queryGroup, ResourceType resourceType) { - if (queryGroup == null || !queryGroupLevelResourceUsageViews.containsKey(queryGroup.get_id())) { - return false; - } - QueryGroupLevelResourceUsageView queryGroupResourceUsageView = queryGroupLevelResourceUsageViews.get(queryGroup.get_id()); - return queryGroupResourceUsageView.getResourceUsageData() - .get(resourceType) - .isBreachingThresholdFor(queryGroup, workloadManagementSettings); + return getExcessUsage(queryGroup, resourceType) > 0; } private List getTaskCancellations(QueryGroup queryGroup, ResourceType resourceType) { - List selectedTasksToCancel = defaultTaskSelectionStrategy.selectTasksForCancellation( + List selectedTasksToCancel = taskSelectionStrategy.selectTasksForCancellation( queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getActiveTasks(), - getReduceBy(queryGroup, resourceType), + getExcessUsage(queryGroup, resourceType), resourceType ); List taskCancellations = new ArrayList<>(); @@ -228,14 +227,17 @@ protected List getTaskCancellationsForDeletedQueryGroup(QueryG return taskCancellations; } - private double getReduceBy(QueryGroup 
queryGroup, ResourceType resourceType) { + private double getExcessUsage(QueryGroup queryGroup, ResourceType resourceType) { if (queryGroup.getResourceLimits().get(resourceType) == null || !queryGroupLevelResourceUsageViews.containsKey(queryGroup.get_id())) { return 0; } - final QueryGroupLevelResourceUsageView queryGroupLevelResourceUsage = queryGroupLevelResourceUsageViews.get(queryGroup.get_id()); - final QueryGroupResourceUsage queryGroupResourceUsage = queryGroupLevelResourceUsage.getResourceUsageData().get(resourceType); - return queryGroupResourceUsage.getReduceByFor(queryGroup, workloadManagementSettings); + + final QueryGroupLevelResourceUsageView queryGroupResourceUsageView = queryGroupLevelResourceUsageViews.get(queryGroup.get_id()); + final double currentUsage = queryGroupResourceUsageView.getResourceUsageData().get(resourceType); + final ResourceUsageUtil resourceUsageUtil = resourceUsageUtilFactory.getInstanceForResourceType(resourceType); + + return resourceUsageUtil.getExcessUsage(queryGroup, currentUsage, workloadManagementSettings); } private void callbackOnCancel() { diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/TaskSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/TaskSelectionStrategy.java new file mode 100644 index 0000000000000..63fbf9b791a33 --- /dev/null +++ b/server/src/main/java/org/opensearch/wlm/cancellation/TaskSelectionStrategy.java @@ -0,0 +1,28 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.wlm.cancellation; + +import org.opensearch.wlm.QueryGroupTask; +import org.opensearch.wlm.ResourceType; + +import java.util.List; + +/** + * This interface exposes a method which implementations use to select the tasks to cancel + */ +public interface TaskSelectionStrategy { + /** + * Determines how the tasks are selected from the list of given tasks based on resource type + * @param tasks to select from + * @param limit min cumulative resource usage sum of selected tasks + * @param resourceType resource type whose usage is summed against the limit + * @return list of tasks selected for cancellation + */ + List selectTasksForCancellation(List tasks, double limit, ResourceType resourceType); +} diff --git a/server/src/main/java/org/opensearch/wlm/tracker/CpuUsageCalculator.java b/server/src/main/java/org/opensearch/wlm/tracker/CpuUsageCalculator.java new file mode 100644 index 0000000000000..95c9dfa3e2ca1 --- /dev/null +++ b/server/src/main/java/org/opensearch/wlm/tracker/CpuUsageCalculator.java @@ -0,0 +1,47 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.wlm.tracker; + +import org.opensearch.core.tasks.resourcetracker.ResourceStats; +import org.opensearch.wlm.QueryGroupTask; + +import java.util.List; +import java.util.function.Supplier; + +/** + * class to help make cpu usage calculations for the query group + */ +public class CpuUsageCalculator implements ResourceUsageCalculator { + // This value should be initialised at the start time of the process and be used throughout the codebase + public static final int PROCESSOR_COUNT = Runtime.getRuntime().availableProcessors(); + private static final CpuUsageCalculator instance = new CpuUsageCalculator(); + + private CpuUsageCalculator() {} + + /** + * static method to access the singleton + * @return eager singleton object of the class + */ + public static CpuUsageCalculator getInstance() { + return instance; + } + + @Override + public double calculateResourceUsage(List tasks, Supplier timeSupplier) { + double usage = tasks.stream().mapToDouble(task -> calculateTaskResourceUsage(task, timeSupplier)).sum(); + + usage /= PROCESSOR_COUNT; + return usage; + } + + @Override + public double calculateTaskResourceUsage(QueryGroupTask task, Supplier nanoTimeSupplier) { + return (1.0f * task.getTotalResourceUtilization(ResourceStats.CPU)) / (nanoTimeSupplier.get() - task.getStartTimeNanos()); + } +} diff --git a/server/src/main/java/org/opensearch/wlm/tracker/MemoryUsageCalculator.java b/server/src/main/java/org/opensearch/wlm/tracker/MemoryUsageCalculator.java new file mode 100644 index 0000000000000..e1ac592360dd3 --- /dev/null +++ b/server/src/main/java/org/opensearch/wlm/tracker/MemoryUsageCalculator.java @@ -0,0 +1,40 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.wlm.tracker; + +import org.opensearch.core.tasks.resourcetracker.ResourceStats; +import org.opensearch.monitor.jvm.JvmStats; +import org.opensearch.wlm.QueryGroupTask; + +import java.util.List; +import java.util.function.Supplier; + +/** + * class to help make memory usage calculations for the query group + */ +public class MemoryUsageCalculator implements ResourceUsageCalculator { + public static final long HEAP_SIZE_BYTES = JvmStats.jvmStats().getMem().getHeapMax().getBytes(); + private static final MemoryUsageCalculator instance = new MemoryUsageCalculator(); + + private MemoryUsageCalculator() {} + + public static MemoryUsageCalculator getInstance() { + return instance; + } + + @Override + public double calculateResourceUsage(List tasks, Supplier timeSupplier) { + return tasks.stream().mapToDouble(task -> calculateTaskResourceUsage(task, timeSupplier)).sum(); + } + + @Override + public double calculateTaskResourceUsage(QueryGroupTask task, Supplier timeSupplier) { + return (1.0f * task.getTotalResourceUtilization(ResourceStats.MEMORY)) / HEAP_SIZE_BYTES; + } +} diff --git a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsage.java b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsage.java deleted file mode 100644 index 8ac08792cba62..0000000000000 --- a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsage.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. 
- */ - -package org.opensearch.wlm.tracker; - -import org.opensearch.cluster.metadata.QueryGroup; -import org.opensearch.wlm.QueryGroupTask; -import org.opensearch.wlm.ResourceType; -import org.opensearch.wlm.WorkloadManagementSettings; - -import java.util.List; -import java.util.function.Supplier; - -import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.PROCESSOR_COUNT; - -/** - * This class is used to track query group level resource usage - */ -public abstract class QueryGroupResourceUsage { - private double currentUsage; - - /** - * getter for value field - * @return resource usage value - */ - public double getCurrentUsage() { - return currentUsage; - } - - public void setCurrentUsage(double currentUsage) { - this.currentUsage = currentUsage; - } - - public static QueryGroupResourceUsage from(ResourceType resourceType) { - if (resourceType == ResourceType.CPU) { - return new QueryGroupCpuUsage(); - } else if (resourceType == ResourceType.MEMORY) { - return new QueryGroupMemoryUsage(); - } - throw new IllegalArgumentException("Invalid resource type: " + resourceType + ". 
It is currently not supported in wlm"); - } - - /** - * Determines whether {@link QueryGroup} is breaching its threshold for the resource - * @param queryGroup - * @return whether the query group is breaching threshold for this resource - */ - public boolean isBreachingThresholdFor(QueryGroup queryGroup, WorkloadManagementSettings settings) { - return getCurrentUsage() > getNormalisedThresholdFor(queryGroup, settings); - } - - /** - * returns the value by which the resource usage is beyond the configured limit for the query group - * @param queryGroup instance - * @param settings {@link WorkloadManagementSettings} instance - * @return the overshooting limit for the resource - */ - public double getReduceByFor(QueryGroup queryGroup, WorkloadManagementSettings settings) { - return getCurrentUsage() - getNormalisedThresholdFor(queryGroup, settings); - } - - /** - * initialises the member variable currentUsage - * @param tasks list of tasks in the query group - * @param timeSupplier nano time supplier - */ - public void initialise(List tasks, Supplier timeSupplier) { - this.setCurrentUsage(this.calculateResourceUsage(tasks, timeSupplier)); - } - - /** - * normalises configured value with respect to node level cancellation thresholds - * @param queryGroup instance - * @param settings {@link WorkloadManagementSettings} instance - * @return normalised value with respect to node level cancellation thresholds - */ - public abstract double getNormalisedThresholdFor(QueryGroup queryGroup, WorkloadManagementSettings settings); - - /** - * calculates the current resource usage for the query group - * @param tasks list of tasks in the query group - * @param timeSupplier nano time supplier - */ - protected abstract double calculateResourceUsage(List tasks, Supplier timeSupplier); - - /** - * class to store cpu usage for the query group - */ - public static class QueryGroupCpuUsage extends QueryGroupResourceUsage { - @Override - public double getNormalisedThresholdFor(QueryGroup 
queryGroup, WorkloadManagementSettings settings) { - return settings.getNodeLevelCpuCancellationThreshold() * queryGroup.getResourceLimits().get(ResourceType.CPU); - } - - @Override - protected double calculateResourceUsage(List tasks, Supplier timeSupplier) { - double usage = tasks.stream().mapToDouble(task -> { - return TaskResourceUsageCalculator.from(ResourceType.CPU).calculateFor(task, timeSupplier); - }).sum(); - - usage /= PROCESSOR_COUNT; - return usage; - } - } - - /** - * class to store memory usage for the query group - */ - public static class QueryGroupMemoryUsage extends QueryGroupResourceUsage { - @Override - public double getNormalisedThresholdFor(QueryGroup queryGroup, WorkloadManagementSettings settings) { - return settings.getNodeLevelMemoryCancellationThreshold() * queryGroup.getResourceLimits().get(ResourceType.MEMORY); - } - - @Override - protected double calculateResourceUsage(List tasks, Supplier timeSupplier) { - double usage = tasks.stream().mapToDouble(task -> { - return TaskResourceUsageCalculator.from(ResourceType.MEMORY).calculateFor(task, timeSupplier); - }).sum(); - return usage; - } - } -} diff --git a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java index 2f7f6544c5207..dc3dde0c78886 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java @@ -8,7 +8,6 @@ package org.opensearch.wlm.tracker; -import org.opensearch.monitor.jvm.JvmStats; import org.opensearch.tasks.TaskResourceTrackingService; import org.opensearch.wlm.QueryGroupLevelResourceUsageView; import org.opensearch.wlm.QueryGroupTask; @@ -25,21 +24,24 @@ * This class tracks resource usage per QueryGroup */ public class QueryGroupResourceUsageTrackerService { - public static final long HEAP_SIZE_BYTES = 
JvmStats.jvmStats().getMem().getHeapMax().getBytes(); - // This value should be initialised at the start time of the process and be used throughout the codebase - public static final int PROCESSOR_COUNT = Runtime.getRuntime().availableProcessors(); public static final EnumSet TRACKED_RESOURCES = EnumSet.allOf(ResourceType.class); private final TaskResourceTrackingService taskResourceTrackingService; private final Supplier nanoTimeSupplier; + private final ResourceUsageCalculatorFactory resourceUsageCalculatorFactory; /** * QueryGroupResourceTrackerService constructor * * @param taskResourceTrackingService Service that helps track resource usage of tasks running on a node. */ - public QueryGroupResourceUsageTrackerService(TaskResourceTrackingService taskResourceTrackingService, Supplier nanoTimeSupplier) { + public QueryGroupResourceUsageTrackerService( + TaskResourceTrackingService taskResourceTrackingService, + Supplier nanoTimeSupplier, + ResourceUsageCalculatorFactory resourceUsageCalculatorFactory + ) { this.taskResourceTrackingService = taskResourceTrackingService; this.nanoTimeSupplier = nanoTimeSupplier; + this.resourceUsageCalculatorFactory = resourceUsageCalculatorFactory; } /** @@ -54,11 +56,13 @@ public Map constructQueryGroupLevelUsa // Iterate over each QueryGroup entry for (Map.Entry> queryGroupEntry : tasksByQueryGroup.entrySet()) { // Compute the QueryGroup resource usage - final Map resourceUsage = new HashMap<>(); + final Map resourceUsage = new HashMap<>(); for (ResourceType resourceType : TRACKED_RESOURCES) { - final QueryGroupResourceUsage queryGroupResourceUsage = QueryGroupResourceUsage.from(resourceType); - queryGroupResourceUsage.initialise(queryGroupEntry.getValue(), nanoTimeSupplier); - resourceUsage.put(resourceType, queryGroupResourceUsage); + final ResourceUsageCalculator resourceUsageCalculator = resourceUsageCalculatorFactory.getInstanceForResourceType( + resourceType + ); + double usage = 
resourceUsageCalculator.calculateResourceUsage(queryGroupEntry.getValue(), nanoTimeSupplier); + resourceUsage.put(resourceType, usage); } // Add to the QueryGroup View diff --git a/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculator.java b/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculator.java new file mode 100644 index 0000000000000..a5777b36e87e4 --- /dev/null +++ b/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculator.java @@ -0,0 +1,35 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.wlm.tracker; + +import org.opensearch.wlm.QueryGroupTask; + +import java.util.List; +import java.util.function.Supplier; + +/** + * This interface is used to calculate query group level and task level resource usage + */ +public interface ResourceUsageCalculator { + /** + * calculates the current resource usage for the query group + * + * @param tasks list of tasks in the query group + * @param timeSupplier nano time supplier + */ + double calculateResourceUsage(List tasks, Supplier timeSupplier); + + /** + * calculates the task level resource usage + * @param task QueryGroupTask + * @param timeSupplier in nano seconds unit + * @return task level resource usage + */ + double calculateTaskResourceUsage(QueryGroupTask task, Supplier timeSupplier); +} diff --git a/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorFactory.java b/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorFactory.java new file mode 100644 index 0000000000000..c0e9c285ccfa8 --- /dev/null +++ b/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorFactory.java @@ -0,0 +1,33 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 
license or a + * compatible open source license. + */ + +package org.opensearch.wlm.tracker; + +import org.opensearch.wlm.ResourceType; + +/** + * Factory class for {@link ResourceUsageCalculator} singleton implementations + */ +public class ResourceUsageCalculatorFactory { + private static ResourceUsageCalculatorFactory instance = new ResourceUsageCalculatorFactory(); + + private ResourceUsageCalculatorFactory() {} + + public static ResourceUsageCalculatorFactory getInstance() { + return instance; + } + + public ResourceUsageCalculator getInstanceForResourceType(ResourceType type) { + if (type == ResourceType.CPU) { + return CpuUsageCalculator.getInstance(); + } else if (type == ResourceType.MEMORY) { + return MemoryUsageCalculator.getInstance(); + } + throw new IllegalArgumentException(type + " is an invalid resource type"); + } +} diff --git a/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtil.java b/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtil.java new file mode 100644 index 0000000000000..d616dc96088a0 --- /dev/null +++ b/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtil.java @@ -0,0 +1,74 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.wlm.tracker; + +import org.opensearch.cluster.metadata.QueryGroup; +import org.opensearch.wlm.ResourceType; +import org.opensearch.wlm.WorkloadManagementSettings; + +/** + * Utility class to provide utility methods at query group level + */ +public abstract class ResourceUsageUtil { + /** + * Determines whether {@link QueryGroup} is breaching its threshold for the resource + * @param queryGroup + * @param currentUsage + * @return whether the query group is breaching threshold for this resource + */ + public boolean isBreachingThresholdFor(QueryGroup queryGroup, double currentUsage, WorkloadManagementSettings settings) { + return getExcessUsage(queryGroup, currentUsage, settings) > 0; + } + + /** + * returns the value by which the resource usage is beyond the configured limit for the query group + * @param queryGroup instance + * @return the overshooting limit for the resource + */ + public double getExcessUsage(QueryGroup queryGroup, double currentUsage, WorkloadManagementSettings settings) { + return currentUsage - getNormalisedThreshold(queryGroup, settings); + } + + /** + * normalises configured value with respect to node level cancellation thresholds + * @param queryGroup instance + * @return normalised value with respect to node level cancellation thresholds + */ + protected abstract double getNormalisedThreshold(QueryGroup queryGroup, WorkloadManagementSettings settings); + + public static class CpuUsageUtil extends ResourceUsageUtil { + private static final CpuUsageUtil instance = new CpuUsageUtil(); + + private CpuUsageUtil() {} + + public static CpuUsageUtil getInstance() { + return instance; + } + + @Override + protected double getNormalisedThreshold(QueryGroup queryGroup, WorkloadManagementSettings settings) { + return queryGroup.getResourceLimits().get(ResourceType.CPU) * settings.getNodeLevelCpuCancellationThreshold(); + } + } + + public static class MemoryUsageUtil extends ResourceUsageUtil { + private static final 
MemoryUsageUtil instance = new MemoryUsageUtil(); + + private MemoryUsageUtil() {} + + public static MemoryUsageUtil getInstance() { + return instance; + } + + @Override + public double getNormalisedThreshold(QueryGroup queryGroup, WorkloadManagementSettings settings) { + return queryGroup.getResourceLimits().get(ResourceType.MEMORY) * settings.getNodeLevelMemoryCancellationThreshold(); + } + } +} diff --git a/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtilFactory.java b/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtilFactory.java new file mode 100644 index 0000000000000..df68cdec30a6a --- /dev/null +++ b/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtilFactory.java @@ -0,0 +1,33 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.wlm.tracker; + +import org.opensearch.wlm.ResourceType; + +/** + * Factory class for {@link ResourceUsageUtil} implementations + */ +public class ResourceUsageUtilFactory { + private static ResourceUsageUtilFactory instance = new ResourceUsageUtilFactory(); + + private ResourceUsageUtilFactory() {} + + public static ResourceUsageUtilFactory getInstance() { + return instance; + } + + public ResourceUsageUtil getInstanceForResourceType(ResourceType type) { + if (type == ResourceType.CPU) { + return ResourceUsageUtil.CpuUsageUtil.getInstance(); + } else if (type == ResourceType.MEMORY) { + return ResourceUsageUtil.MemoryUsageUtil.getInstance(); + } + throw new IllegalArgumentException(type + " is an invalid resource type"); + } +} diff --git a/server/src/main/java/org/opensearch/wlm/tracker/TaskResourceUsageCalculator.java b/server/src/main/java/org/opensearch/wlm/tracker/TaskResourceUsageCalculator.java deleted file mode 100644 index 40bf17e2b9704..0000000000000 --- 
a/server/src/main/java/org/opensearch/wlm/tracker/TaskResourceUsageCalculator.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -package org.opensearch.wlm.tracker; - -import org.opensearch.core.tasks.resourcetracker.ResourceStats; -import org.opensearch.tasks.Task; -import org.opensearch.wlm.ResourceType; - -import java.util.function.Supplier; - -import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.HEAP_SIZE_BYTES; - -/** - * Utility class to calculate task level resource usage - */ -public abstract class TaskResourceUsageCalculator { - public static TaskResourceUsageCalculator from(final ResourceType resourceType) { - if (resourceType == ResourceType.CPU) { - return new TaskCpuUsageCalculator(); - } else if (resourceType == ResourceType.MEMORY) { - return new TaskMemoryUsageCalculator(); - } - throw new IllegalArgumentException("Invalid resource type " + resourceType + " . 
It is not supported in wlm"); - } - - /** - * calculates the resource usage for the task - * @param task {@link Task} instance - * @param nanoTimeSupplier time supplier in nano second unit - * @return task resource usage - */ - public abstract double calculateFor(Task task, Supplier nanoTimeSupplier); - - /** - * This class will return per core cpu usage for a task - */ - public static class TaskCpuUsageCalculator extends TaskResourceUsageCalculator { - @Override - public double calculateFor(Task task, Supplier nanoTimeSupplier) { - return ((1.0f * task.getTotalResourceUtilization(ResourceStats.CPU)) / (nanoTimeSupplier.get() - task.getStartTimeNanos())); - } - } - - /** - * This class will return allocated bytes by the task since task has been created - */ - public static class TaskMemoryUsageCalculator extends TaskResourceUsageCalculator { - @Override - public double calculateFor(Task task, Supplier nanoTimeSupplier) { - return (1.0f * task.getTotalResourceUtilization(ResourceStats.MEMORY)) / HEAP_SIZE_BYTES; - } - } -} diff --git a/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java b/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java index aec9dc829b26c..0d254ad73f9f4 100644 --- a/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java +++ b/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java @@ -9,33 +9,33 @@ package org.opensearch.wlm; import org.opensearch.test.OpenSearchTestCase; -import org.opensearch.wlm.tracker.QueryGroupResourceUsage; -import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerServiceTests; +import org.opensearch.wlm.tracker.ResourceUsageCalculatorTrackerServiceTests; import java.util.List; import java.util.Map; -import static org.opensearch.wlm.cancellation.DefaultTaskCancellation.MIN_VALUE; -import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTests.createMockTaskWithResourceStats; -import static 
org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.HEAP_SIZE_BYTES; +import static org.opensearch.wlm.cancellation.TaskCanceller.MIN_VALUE; +import static org.opensearch.wlm.tracker.CpuUsageCalculator.PROCESSOR_COUNT; +import static org.opensearch.wlm.tracker.MemoryUsageCalculator.HEAP_SIZE_BYTES; +import static org.opensearch.wlm.tracker.ResourceUsageCalculatorTests.createMockTaskWithResourceStats; +import static org.mockito.Mockito.mock; public class QueryGroupLevelResourceUsageViewTests extends OpenSearchTestCase { - Map resourceUsage; + Map resourceUsage; List activeTasks; - QueryGroupResourceUsageTrackerServiceTests.TestClock clock; + ResourceUsageCalculatorTrackerServiceTests.TestClock clock; + WorkloadManagementSettings settings; public void setUp() throws Exception { super.setUp(); - QueryGroupResourceUsage.QueryGroupCpuUsage cpuUsage = new QueryGroupResourceUsage.QueryGroupCpuUsage(); - QueryGroupResourceUsage.QueryGroupMemoryUsage memoryUsage = new QueryGroupResourceUsage.QueryGroupMemoryUsage(); - clock = new QueryGroupResourceUsageTrackerServiceTests.TestClock(); + settings = mock(WorkloadManagementSettings.class); + clock = new ResourceUsageCalculatorTrackerServiceTests.TestClock(); activeTasks = List.of(createMockTaskWithResourceStats(QueryGroupTask.class, 100, 200, 0, 1)); clock.fastForwardBy(300); + double memoryUsage = 200.0 / HEAP_SIZE_BYTES; + double cpuUsage = 100.0 / (PROCESSOR_COUNT * 300.0); - cpuUsage.initialise(activeTasks, clock::getTime); - memoryUsage.initialise(activeTasks, clock::getTime); - - resourceUsage = Map.of(ResourceType.MEMORY, memoryUsage, ResourceType.CPU, memoryUsage); + resourceUsage = Map.of(ResourceType.MEMORY, memoryUsage, ResourceType.CPU, cpuUsage); } public void testGetResourceUsageData() { @@ -43,7 +43,7 @@ public void testGetResourceUsageData() { resourceUsage, activeTasks ); - Map resourceUsageData = queryGroupLevelResourceUsageView.getResourceUsageData(); + Map resourceUsageData = 
queryGroupLevelResourceUsageView.getResourceUsageData(); assertTrue(assertResourceUsageData(resourceUsageData)); } @@ -57,8 +57,8 @@ public void testGetActiveTasks() { assertEquals(1, activeTasks.get(0).getId()); } - private boolean assertResourceUsageData(Map resourceUsageData) { - return (resourceUsageData.get(ResourceType.MEMORY).getCurrentUsage() - 200.0 / HEAP_SIZE_BYTES) <= MIN_VALUE - && (resourceUsageData.get(ResourceType.CPU).getCurrentUsage() - 100.0 / (300)) < MIN_VALUE; + private boolean assertResourceUsageData(Map resourceUsageData) { + return (resourceUsageData.get(ResourceType.MEMORY) - 200.0 / HEAP_SIZE_BYTES) <= MIN_VALUE + && (resourceUsageData.get(ResourceType.CPU) - 100.0 / (300)) < MIN_VALUE; } } diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/LongestTaskRunningFirstSelectionStrategyTests.java similarity index 68% rename from server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java rename to server/src/test/java/org/opensearch/wlm/cancellation/LongestTaskRunningFirstSelectionStrategyTests.java index f5f0e79903e30..2fcdc3121c2bf 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskSelectionStrategyTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/LongestTaskRunningFirstSelectionStrategyTests.java @@ -14,43 +14,51 @@ import org.opensearch.core.tasks.resourcetracker.ResourceStats; import org.opensearch.core.tasks.resourcetracker.ResourceStatsType; import org.opensearch.core.tasks.resourcetracker.ResourceUsageMetric; -import org.opensearch.tasks.Task; import org.opensearch.test.OpenSearchTestCase; import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; -import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerServiceTests.TestClock; -import org.opensearch.wlm.tracker.TaskResourceUsageCalculator; +import 
org.opensearch.wlm.tracker.MemoryUsageCalculator; +import org.opensearch.wlm.tracker.ResourceUsageCalculatorFactory; +import org.opensearch.wlm.tracker.ResourceUsageCalculatorTrackerServiceTests.TestClock; import java.util.ArrayList; import java.util.Collections; import java.util.List; -import static org.opensearch.wlm.cancellation.DefaultTaskCancellation.MIN_VALUE; -import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.HEAP_SIZE_BYTES; +import static org.opensearch.wlm.cancellation.TaskCanceller.MIN_VALUE; +import static org.opensearch.wlm.tracker.MemoryUsageCalculator.HEAP_SIZE_BYTES; -public class DefaultTaskSelectionStrategyTests extends OpenSearchTestCase { +public class LongestTaskRunningFirstSelectionStrategyTests extends OpenSearchTestCase { private TestClock clock; + private ResourceUsageCalculatorFactory resourceUsageCalculatorFactory; public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsGreaterThanZero() { clock = new TestClock(); - DefaultTaskSelectionStrategy testDefaultTaskSelectionStrategy = new DefaultTaskSelectionStrategy(clock::getTime); + resourceUsageCalculatorFactory = ResourceUsageCalculatorFactory.getInstance(); + LongestTaskRunningFirstSelectionStrategy testLongestTaskRunningFirstSelectionStrategy = + new LongestTaskRunningFirstSelectionStrategy(clock::getTime, resourceUsageCalculatorFactory); long thresholdInLong = 100L; double reduceBy = 50.0 / HEAP_SIZE_BYTES; ResourceType resourceType = ResourceType.MEMORY; List tasks = getListOfTasks(thresholdInLong); - List selectedTasks = testDefaultTaskSelectionStrategy.selectTasksForCancellation(tasks, reduceBy, resourceType); + List selectedTasks = testLongestTaskRunningFirstSelectionStrategy.selectTasksForCancellation( + tasks, + reduceBy, + resourceType + ); assertFalse(selectedTasks.isEmpty()); assertTrue(tasksUsageMeetsThreshold(selectedTasks, reduceBy)); } public void 
testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsLesserThanZero() { - DefaultTaskSelectionStrategy testDefaultTaskSelectionStrategy = new DefaultTaskSelectionStrategy(); + LongestTaskRunningFirstSelectionStrategy testLongestTaskRunningFirstSelectionStrategy = + new LongestTaskRunningFirstSelectionStrategy(); long thresholdInLong = 100L; double reduceBy = -50.0 / HEAP_SIZE_BYTES; ResourceType resourceType = ResourceType.MEMORY; List tasks = getListOfTasks(thresholdInLong); try { - testDefaultTaskSelectionStrategy.selectTasksForCancellation(tasks, reduceBy, resourceType); + testLongestTaskRunningFirstSelectionStrategy.selectTasksForCancellation(tasks, reduceBy, resourceType); } catch (Exception e) { assertTrue(e instanceof IllegalArgumentException); assertEquals("limit has to be greater than zero", e.getMessage()); @@ -58,19 +66,24 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsLess } public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsEqualToZero() { - DefaultTaskSelectionStrategy testDefaultTaskSelectionStrategy = new DefaultTaskSelectionStrategy(); + LongestTaskRunningFirstSelectionStrategy testLongestTaskRunningFirstSelectionStrategy = + new LongestTaskRunningFirstSelectionStrategy(); long thresholdInLong = 100L; double reduceBy = 0.0; ResourceType resourceType = ResourceType.MEMORY; List tasks = getListOfTasks(thresholdInLong); - List selectedTasks = testDefaultTaskSelectionStrategy.selectTasksForCancellation(tasks, reduceBy, resourceType); + List selectedTasks = testLongestTaskRunningFirstSelectionStrategy.selectTasksForCancellation( + tasks, + reduceBy, + resourceType + ); assertTrue(selectedTasks.isEmpty()); } private boolean tasksUsageMeetsThreshold(List selectedTasks, double threshold) { double memory = 0; - for (Task task : selectedTasks) { - memory += TaskResourceUsageCalculator.from(ResourceType.MEMORY).calculateFor(task, clock::getTime); + for (QueryGroupTask task : selectedTasks) { 
+ memory += MemoryUsageCalculator.getInstance().calculateTaskResourceUsage(task, clock::getTime); if ((memory - threshold) > MIN_VALUE) { return true; } diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellerTests.java similarity index 71% rename from server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java rename to server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellerTests.java index fd4875de18bcd..b3ed024a04ad9 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/DefaultTaskCancellationTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellerTests.java @@ -20,10 +20,10 @@ import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; import org.opensearch.wlm.WorkloadManagementSettings; -import org.opensearch.wlm.tracker.QueryGroupResourceUsage; -import org.opensearch.wlm.tracker.QueryGroupResourceUsage.QueryGroupCpuUsage; -import org.opensearch.wlm.tracker.QueryGroupResourceUsage.QueryGroupMemoryUsage; -import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerServiceTests.TestClock; +import org.opensearch.wlm.tracker.ResourceUsageCalculatorTrackerServiceTests.TestClock; +import org.opensearch.wlm.tracker.ResourceUsageUtil.CpuUsageUtil; +import org.opensearch.wlm.tracker.ResourceUsageUtil.MemoryUsageUtil; +import org.opensearch.wlm.tracker.ResourceUsageUtilFactory; import org.junit.Before; import java.util.Collection; @@ -36,33 +36,35 @@ import java.util.function.BooleanSupplier; import java.util.stream.Collectors; -import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; -public class DefaultTaskCancellationTests extends OpenSearchTestCase { +public class TaskCancellerTests extends OpenSearchTestCase { private static final String queryGroupId1 = "queryGroup1"; private static 
final String queryGroupId2 = "queryGroup2"; private TestClock clock; + private ResourceUsageUtilFactory resourceUsageUtilFactory; - private static class TestTaskCancellationImpl extends DefaultTaskCancellation { + private static class TestTaskCancellerImpl extends TaskCanceller { - public TestTaskCancellationImpl( + public TestTaskCancellerImpl( WorkloadManagementSettings workloadManagementSettings, - DefaultTaskSelectionStrategy defaultTaskSelectionStrategy, + LongestTaskRunningFirstSelectionStrategy longestTaskRunningFirstSelectionStrategy, Map queryGroupLevelViews, Set activeQueryGroups, Set deletedQueryGroups, - BooleanSupplier isNodeInDuress + BooleanSupplier isNodeInDuress, + ResourceUsageUtilFactory resourceUsageUtilFactory ) { super( workloadManagementSettings, - defaultTaskSelectionStrategy, + longestTaskRunningFirstSelectionStrategy, queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, - isNodeInDuress + isNodeInDuress, + resourceUsageUtilFactory ); } } @@ -70,7 +72,9 @@ public TestTaskCancellationImpl( private Map queryGroupLevelViews; private Set activeQueryGroups; private Set deletedQueryGroups; - private DefaultTaskCancellation taskCancellation; + private TaskCanceller taskCancellation; + private CpuUsageUtil cpuUsageUtil; + private MemoryUsageUtil memoryUsageUtil; private WorkloadManagementSettings workloadManagementSettings; @Before @@ -79,21 +83,28 @@ public void setup() { queryGroupLevelViews = new HashMap<>(); activeQueryGroups = new HashSet<>(); deletedQueryGroups = new HashSet<>(); + resourceUsageUtilFactory = mock(ResourceUsageUtilFactory.class); + cpuUsageUtil = mock(CpuUsageUtil.class); + memoryUsageUtil = mock(MemoryUsageUtil.class); + when(resourceUsageUtilFactory.getInstanceForResourceType(ResourceType.CPU)).thenReturn(cpuUsageUtil); + when(resourceUsageUtilFactory.getInstanceForResourceType(ResourceType.MEMORY)).thenReturn(memoryUsageUtil); + clock = new TestClock(); - taskCancellation = new TestTaskCancellationImpl( + 
taskCancellation = new TestTaskCancellerImpl( workloadManagementSettings, - new DefaultTaskSelectionStrategy(), + new LongestTaskRunningFirstSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, - () -> false + () -> false, + resourceUsageUtilFactory ); } public void testGetCancellableTasksFrom_setupAppropriateCancellationReasonAndScore() { ResourceType resourceType = ResourceType.CPU; - QueryGroupCpuUsage cpuUsage = mock(QueryGroupCpuUsage.class); - QueryGroupMemoryUsage memoryUsage = mock(QueryGroupMemoryUsage.class); + double cpuUsage = 0.11; + double memoryUsage = 0.0; Double threshold = 0.1; QueryGroup queryGroup1 = new QueryGroup( @@ -103,10 +114,9 @@ public void testGetCancellableTasksFrom_setupAppropriateCancellationReasonAndSco 1L ); clock.fastForwardBy(1000); - when(memoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); - when(cpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); - when(cpuUsage.getReduceByFor(any(), any())).thenReturn(0.001); - when(memoryUsage.getReduceByFor(any(), any())).thenReturn(0.0); + + when(memoryUsageUtil.getExcessUsage(queryGroup1, memoryUsage, workloadManagementSettings)).thenReturn(0.0); + when(cpuUsageUtil.getExcessUsage(queryGroup1, cpuUsage, workloadManagementSettings)).thenReturn(0.01); QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); when(mockView.getResourceUsageData()).thenReturn(Map.of(resourceType, cpuUsage, ResourceType.MEMORY, memoryUsage)); @@ -128,8 +138,8 @@ public void testGetCancellableTasksFrom_setupAppropriateCancellationReasonAndSco public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThreshold() { ResourceType resourceType = ResourceType.CPU; - QueryGroupCpuUsage cpuUsage = mock(QueryGroupCpuUsage.class); - QueryGroupMemoryUsage memoryUsage = mock(QueryGroupMemoryUsage.class); + double cpuUsage = 0.11; + double memoryUsage = 0.0; Double threshold = 0.1; QueryGroup queryGroup1 = new QueryGroup( @@ -138,10 +148,9 @@ 
public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThreshold() { new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold)), 1L ); - when(memoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); - when(cpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); - when(cpuUsage.getReduceByFor(any(), any())).thenReturn(0.15); + when(memoryUsageUtil.getExcessUsage(queryGroup1, memoryUsage, workloadManagementSettings)).thenReturn(0.0); + when(cpuUsageUtil.getExcessUsage(queryGroup1, cpuUsage, workloadManagementSettings)).thenReturn(0.01); QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); when(mockView.getResourceUsageData()).thenReturn(Map.of(resourceType, cpuUsage, ResourceType.MEMORY, memoryUsage)); @@ -155,8 +164,8 @@ public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThreshold() { public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThresholdForMemory() { ResourceType resourceType = ResourceType.MEMORY; - QueryGroupCpuUsage cpuUsage = mock(QueryGroupCpuUsage.class); - QueryGroupMemoryUsage memoryUsage = mock(QueryGroupMemoryUsage.class); + double cpuUsage = 0.0; + double memoryUsage = 0.11; Double threshold = 0.1; QueryGroup queryGroup1 = new QueryGroup( @@ -165,11 +174,11 @@ public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThresholdForMem new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold)), 1L ); - when(memoryUsage.getCurrentUsage()).thenReturn(0.15); - when(memoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); - when(memoryUsage.getReduceByFor(any(), any())).thenReturn(0.005); - when(cpuUsage.getCurrentUsage()).thenReturn(0.0); - when(cpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); + + when(memoryUsageUtil.getExcessUsage(queryGroup1, memoryUsage, workloadManagementSettings)).thenReturn(0.01); + when(memoryUsageUtil.isBreachingThresholdFor(queryGroup1, memoryUsage, 
workloadManagementSettings)).thenReturn(true); + when(cpuUsageUtil.isBreachingThresholdFor(queryGroup1, memoryUsage, workloadManagementSettings)).thenReturn(false); + when(cpuUsageUtil.getExcessUsage(queryGroup1, cpuUsage, workloadManagementSettings)).thenReturn(0.0); QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); when(mockView.getResourceUsageData()).thenReturn(Map.of(ResourceType.CPU, cpuUsage, resourceType, memoryUsage)); @@ -185,8 +194,8 @@ public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThresholdForMem public void testGetCancellableTasksFrom_returnsNoTasksWhenNotBreachingThreshold() { ResourceType resourceType = ResourceType.CPU; - QueryGroupCpuUsage cpuUsage = mock(QueryGroupCpuUsage.class); - QueryGroupMemoryUsage memoryUsage = mock(QueryGroupMemoryUsage.class); + double cpuUsage = 0.91; + double memoryUsage = 0.0; Double threshold = 0.9; QueryGroup queryGroup1 = new QueryGroup( "testQueryGroup", @@ -194,17 +203,13 @@ public void testGetCancellableTasksFrom_returnsNoTasksWhenNotBreachingThreshold( new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold)), 1L ); - when(memoryUsage.getCurrentUsage()).thenReturn(0.0); - when(memoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); - when(memoryUsage.getReduceByFor(any(), any())).thenReturn(0.005); - when(cpuUsage.getCurrentUsage()).thenReturn(0.0); - when(cpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); + when(memoryUsageUtil.getExcessUsage(queryGroup1, memoryUsage, workloadManagementSettings)).thenReturn(0.0); + when(cpuUsageUtil.getExcessUsage(queryGroup1, cpuUsage, workloadManagementSettings)).thenReturn(0.0); QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); when(mockView.getResourceUsageData()).thenReturn(Map.of(ResourceType.CPU, cpuUsage, ResourceType.MEMORY, memoryUsage)); queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); - 
when(workloadManagementSettings.getNodeLevelCpuCancellationThreshold()).thenReturn(0.90); List cancellableTasksFrom = taskCancellation.getCancellableTasksFrom(queryGroup1); assertTrue(cancellableTasksFrom.isEmpty()); @@ -212,7 +217,7 @@ public void testGetCancellableTasksFrom_returnsNoTasksWhenNotBreachingThreshold( public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { ResourceType resourceType = ResourceType.CPU; - QueryGroupCpuUsage usage = mock(QueryGroupCpuUsage.class); + double usage = 0.02; Double threshold = 0.01; QueryGroup queryGroup1 = new QueryGroup( @@ -226,13 +231,14 @@ public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); - TestTaskCancellationImpl taskCancellation = new TestTaskCancellationImpl( + TestTaskCancellerImpl taskCancellation = new TestTaskCancellerImpl( workloadManagementSettings, - new DefaultTaskSelectionStrategy(), + new LongestTaskRunningFirstSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, - () -> false + () -> false, + resourceUsageUtilFactory ); List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.SOFT); @@ -241,8 +247,8 @@ public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { public void testCancelTasks_cancelsGivenTasks() { ResourceType resourceType = ResourceType.CPU; - QueryGroupCpuUsage cpuUsage = mock(QueryGroupCpuUsage.class); - QueryGroupMemoryUsage memoryUsage = mock(QueryGroupMemoryUsage.class); + double cpuUsage = 0.011; + double memoryUsage = 0.0; Double threshold = 0.01; @@ -253,12 +259,9 @@ public void testCancelTasks_cancelsGivenTasks() { 1L ); - when(memoryUsage.getCurrentUsage()).thenReturn(0.15); - when(memoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); - - when(cpuUsage.getReduceByFor(any(), any())).thenReturn(0.005); - when(cpuUsage.getCurrentUsage()).thenReturn(0.16); - 
when(cpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); + when(memoryUsageUtil.getExcessUsage(queryGroup1, memoryUsage, workloadManagementSettings)).thenReturn(0.0); + when(cpuUsageUtil.getExcessUsage(queryGroup1, cpuUsage, workloadManagementSettings)).thenReturn(0.01); + when(cpuUsageUtil.isBreachingThresholdFor(queryGroup1, cpuUsage, workloadManagementSettings)).thenReturn(true); QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); when(mockView.getResourceUsageData()).thenReturn(Map.of(ResourceType.CPU, cpuUsage, ResourceType.MEMORY, memoryUsage)); @@ -266,13 +269,14 @@ public void testCancelTasks_cancelsGivenTasks() { queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); - TestTaskCancellationImpl taskCancellation = new TestTaskCancellationImpl( + TestTaskCancellerImpl taskCancellation = new TestTaskCancellerImpl( workloadManagementSettings, - new DefaultTaskSelectionStrategy(), + new LongestTaskRunningFirstSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, - () -> false + () -> false, + resourceUsageUtilFactory ); List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); @@ -287,10 +291,10 @@ public void testCancelTasks_cancelsGivenTasks() { public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { ResourceType resourceType = ResourceType.CPU; - QueryGroupCpuUsage activeQueryGroupCpuUsage = mock(QueryGroupCpuUsage.class); - QueryGroupMemoryUsage activeQueryGroupMemoryUsage = mock(QueryGroupMemoryUsage.class); - QueryGroupCpuUsage deletedQueryGroupCpuUsage = mock(QueryGroupCpuUsage.class); - QueryGroupMemoryUsage deletedQueryGroupMemoryUsage = mock(QueryGroupMemoryUsage.class); + double activeQueryGroupCpuUsage = 0.0; + double activeQueryGroupMemoryUsage = 0.0; + double deletedQueryGroupCpuUsage = 0.011; + double deletedQueryGroupMemoryUsage = 0.0; Double threshold = 0.01; QueryGroup activeQueryGroup = new QueryGroup( 
@@ -307,17 +311,15 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { 1L ); - when(activeQueryGroupCpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); - when(deletedQueryGroupCpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); - - when(deletedQueryGroupMemoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); - when(activeQueryGroupMemoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); - - when(deletedQueryGroupMemoryUsage.getReduceByFor(any(), any())).thenReturn(0.0); - when(activeQueryGroupMemoryUsage.getReduceByFor(any(), any())).thenReturn(0.0); + when(memoryUsageUtil.getExcessUsage(deletedQueryGroup, deletedQueryGroupMemoryUsage, workloadManagementSettings)).thenReturn(0.0); + when(cpuUsageUtil.getExcessUsage(deletedQueryGroup, deletedQueryGroupCpuUsage, workloadManagementSettings)).thenReturn(0.01); + when(cpuUsageUtil.isBreachingThresholdFor(deletedQueryGroup, deletedQueryGroupCpuUsage, workloadManagementSettings)).thenReturn( + true + ); - when(deletedQueryGroupCpuUsage.getReduceByFor(any(), any())).thenReturn(0.001); - when(activeQueryGroupCpuUsage.getReduceByFor(any(), any())).thenReturn(0.001); + when(memoryUsageUtil.getExcessUsage(activeQueryGroup, activeQueryGroupMemoryUsage, workloadManagementSettings)).thenReturn(0.0); + when(cpuUsageUtil.getExcessUsage(activeQueryGroup, activeQueryGroupCpuUsage, workloadManagementSettings)).thenReturn(0.01); + when(cpuUsageUtil.isBreachingThresholdFor(activeQueryGroup, activeQueryGroupCpuUsage, workloadManagementSettings)).thenReturn(true); QueryGroupLevelResourceUsageView mockView1 = createResourceUsageViewMock(); QueryGroupLevelResourceUsageView mockView2 = createResourceUsageViewMock( @@ -338,13 +340,14 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { activeQueryGroups.add(activeQueryGroup); deletedQueryGroups.add(deletedQueryGroup); - TestTaskCancellationImpl taskCancellation = new TestTaskCancellationImpl( + 
TestTaskCancellerImpl taskCancellation = new TestTaskCancellerImpl( workloadManagementSettings, - new DefaultTaskSelectionStrategy(), + new LongestTaskRunningFirstSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, - () -> true + () -> true, + resourceUsageUtilFactory ); List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); @@ -369,10 +372,10 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeNotInDuress() { ResourceType resourceType = ResourceType.CPU; - QueryGroupCpuUsage activeQueryGroupCpuUsage = mock(QueryGroupCpuUsage.class); - QueryGroupMemoryUsage activeQueryGroupMemoryUsage = mock(QueryGroupMemoryUsage.class); - QueryGroupCpuUsage deletedQueryGroupCpuUsage = mock(QueryGroupCpuUsage.class); - QueryGroupMemoryUsage deletedQueryGroupMemoryUsage = mock(QueryGroupMemoryUsage.class); + double activeQueryGroupCpuUsage = 0.11; + double activeQueryGroupMemoryUsage = 0.0; + double deletedQueryGroupCpuUsage = 0.11; + double deletedQueryGroupMemoryUsage = 0.0; Double threshold = 0.01; @@ -390,15 +393,15 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN 1L ); - when(activeQueryGroupCpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); - when(deletedQueryGroupCpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); - when(activeQueryGroupMemoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); - when(deletedQueryGroupMemoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); + when(memoryUsageUtil.getExcessUsage(deletedQueryGroup, deletedQueryGroupMemoryUsage, workloadManagementSettings)).thenReturn(0.0); + when(cpuUsageUtil.getExcessUsage(deletedQueryGroup, deletedQueryGroupCpuUsage, workloadManagementSettings)).thenReturn(0.01); + when(cpuUsageUtil.isBreachingThresholdFor(deletedQueryGroup, deletedQueryGroupCpuUsage, 
workloadManagementSettings)).thenReturn( + true + ); - when(deletedQueryGroupCpuUsage.getReduceByFor(any(), any())).thenReturn(0.001); - when(activeQueryGroupCpuUsage.getReduceByFor(any(), any())).thenReturn(0.001); - when(deletedQueryGroupMemoryUsage.getReduceByFor(any(), any())).thenReturn(0.0); - when(activeQueryGroupMemoryUsage.getReduceByFor(any(), any())).thenReturn(0.0); + when(memoryUsageUtil.getExcessUsage(activeQueryGroup, activeQueryGroupMemoryUsage, workloadManagementSettings)).thenReturn(0.0); + when(cpuUsageUtil.getExcessUsage(activeQueryGroup, activeQueryGroupCpuUsage, workloadManagementSettings)).thenReturn(0.01); + when(cpuUsageUtil.isBreachingThresholdFor(activeQueryGroup, activeQueryGroupCpuUsage, workloadManagementSettings)).thenReturn(true); QueryGroupLevelResourceUsageView mockView1 = createResourceUsageViewMock(); QueryGroupLevelResourceUsageView mockView2 = createResourceUsageViewMock( @@ -419,13 +422,14 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN activeQueryGroups.add(activeQueryGroup); deletedQueryGroups.add(deletedQueryGroup); - TestTaskCancellationImpl taskCancellation = new TestTaskCancellationImpl( + TestTaskCancellerImpl taskCancellation = new TestTaskCancellerImpl( workloadManagementSettings, - new DefaultTaskSelectionStrategy(), + new LongestTaskRunningFirstSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, - () -> false + () -> false, + resourceUsageUtilFactory ); List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); @@ -450,10 +454,10 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { ResourceType resourceType = ResourceType.CPU; - QueryGroupCpuUsage cpuUsage1 = mock(QueryGroupCpuUsage.class); - QueryGroupMemoryUsage memoryUsage1 = mock(QueryGroupMemoryUsage.class); - QueryGroupCpuUsage cpuUsage2 = 
mock(QueryGroupCpuUsage.class); - QueryGroupMemoryUsage memoryUsage2 = mock(QueryGroupMemoryUsage.class); + double cpuUsage1 = 0.11; + double memoryUsage1 = 0.0; + double cpuUsage2 = 0.11; + double memoryUsage2 = 0.0; Double threshold = 0.01; QueryGroup queryGroup1 = new QueryGroup( @@ -470,15 +474,13 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { 1L ); - when(cpuUsage1.isBreachingThresholdFor(any(), any())).thenReturn(true); - when(cpuUsage2.isBreachingThresholdFor(any(), any())).thenReturn(true); - when(memoryUsage2.isBreachingThresholdFor(any(), any())).thenReturn(false); - when(memoryUsage1.isBreachingThresholdFor(any(), any())).thenReturn(false); + when(memoryUsageUtil.getExcessUsage(queryGroup1, memoryUsage1, workloadManagementSettings)).thenReturn(0.0); + when(cpuUsageUtil.getExcessUsage(queryGroup1, cpuUsage1, workloadManagementSettings)).thenReturn(0.01); + when(cpuUsageUtil.isBreachingThresholdFor(queryGroup1, cpuUsage1, workloadManagementSettings)).thenReturn(true); - when(cpuUsage1.getReduceByFor(any(), any())).thenReturn(0.001); - when(cpuUsage2.getReduceByFor(any(), any())).thenReturn(0.001); - when(memoryUsage2.getReduceByFor(any(), any())).thenReturn(0.0); - when(memoryUsage1.getReduceByFor(any(), any())).thenReturn(0.0); + when(memoryUsageUtil.getExcessUsage(queryGroup2, memoryUsage2, workloadManagementSettings)).thenReturn(0.0); + when(cpuUsageUtil.getExcessUsage(queryGroup2, cpuUsage2, workloadManagementSettings)).thenReturn(0.01); + when(cpuUsageUtil.isBreachingThresholdFor(queryGroup2, cpuUsage2, workloadManagementSettings)).thenReturn(true); QueryGroupLevelResourceUsageView mockView1 = createResourceUsageViewMock(); when(mockView1.getResourceUsageData()).thenReturn(Map.of(ResourceType.CPU, cpuUsage1, ResourceType.MEMORY, memoryUsage1)); @@ -489,13 +491,14 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { queryGroupLevelViews.put(queryGroupId2, mockView); Collections.addAll(activeQueryGroups, 
queryGroup1, queryGroup2); - TestTaskCancellationImpl taskCancellation = new TestTaskCancellationImpl( + TestTaskCancellerImpl taskCancellation = new TestTaskCancellerImpl( workloadManagementSettings, - new DefaultTaskSelectionStrategy(), + new LongestTaskRunningFirstSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, - () -> true + () -> true, + resourceUsageUtilFactory ); List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); @@ -517,8 +520,8 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { public void testGetAllCancellableTasks_ReturnsNoTasksWhenNotBreachingThresholds() { ResourceType resourceType = ResourceType.CPU; - QueryGroupCpuUsage queryGroupCpuUsage = mock(QueryGroupCpuUsage.class); - QueryGroupMemoryUsage queryGroupMemoryUsage = mock(QueryGroupMemoryUsage.class); + double queryGroupCpuUsage = 0.11; + double queryGroupMemoryUsage = 0.0; Double threshold = 0.1; QueryGroup queryGroup1 = new QueryGroup( @@ -527,10 +530,7 @@ public void testGetAllCancellableTasks_ReturnsNoTasksWhenNotBreachingThresholds( new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold)), 1L ); - when(queryGroupCpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); - when(queryGroupMemoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); - when(queryGroupCpuUsage.getReduceByFor(any(), any())).thenReturn(0.0); - when(queryGroupCpuUsage.getReduceByFor(any(), any())).thenReturn(0.0); + QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); when(mockView.getResourceUsageData()).thenReturn( Map.of(ResourceType.CPU, queryGroupCpuUsage, ResourceType.MEMORY, queryGroupMemoryUsage) @@ -544,8 +544,8 @@ public void testGetAllCancellableTasks_ReturnsNoTasksWhenNotBreachingThresholds( public void testGetAllCancellableTasks_ReturnsTasksWhenBreachingThresholds() { ResourceType resourceType = ResourceType.CPU; - QueryGroupCpuUsage 
queryGroupCpuUsage = mock(QueryGroupCpuUsage.class); - QueryGroupMemoryUsage queryGroupMemoryUsage = mock(QueryGroupMemoryUsage.class); + double cpuUsage = 0.11; + double memoryUsage = 0.0; Double threshold = 0.01; QueryGroup queryGroup1 = new QueryGroup( @@ -554,15 +554,13 @@ public void testGetAllCancellableTasks_ReturnsTasksWhenBreachingThresholds() { new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold)), 1L ); - when(queryGroupCpuUsage.isBreachingThresholdFor(any(), any())).thenReturn(true); - when(queryGroupCpuUsage.getReduceByFor(any(), any())).thenReturn(0.005); - when(queryGroupMemoryUsage.isBreachingThresholdFor(any(), any())).thenReturn(false); - when(queryGroupMemoryUsage.getReduceByFor(any(), any())).thenReturn(0.0); + + when(memoryUsageUtil.getExcessUsage(queryGroup1, memoryUsage, workloadManagementSettings)).thenReturn(0.0); + when(cpuUsageUtil.getExcessUsage(queryGroup1, cpuUsage, workloadManagementSettings)).thenReturn(0.01); + when(cpuUsageUtil.isBreachingThresholdFor(queryGroup1, cpuUsage, workloadManagementSettings)).thenReturn(true); QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); - when(mockView.getResourceUsageData()).thenReturn( - Map.of(ResourceType.CPU, queryGroupCpuUsage, ResourceType.MEMORY, queryGroupMemoryUsage) - ); + when(mockView.getResourceUsageData()).thenReturn(Map.of(ResourceType.CPU, cpuUsage, ResourceType.MEMORY, memoryUsage)); queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); @@ -574,7 +572,7 @@ public void testGetAllCancellableTasks_ReturnsTasksWhenBreachingThresholds() { public void testGetCancellableTasksFrom_doesNotReturnTasksWhenQueryGroupIdNotFound() { ResourceType resourceType = ResourceType.CPU; - QueryGroupCpuUsage usage = mock(QueryGroupCpuUsage.class); + double usage = 0.11; Double threshold = 0.01; QueryGroup queryGroup1 = new QueryGroup( @@ -605,11 +603,7 @@ private QueryGroupLevelResourceUsageView 
createResourceUsageViewMock() { return mockView; } - private QueryGroupLevelResourceUsageView createResourceUsageViewMock( - ResourceType resourceType, - QueryGroupResourceUsage usage, - Collection ids - ) { + private QueryGroupLevelResourceUsageView createResourceUsageViewMock(ResourceType resourceType, double usage, Collection ids) { QueryGroupLevelResourceUsageView mockView = mock(QueryGroupLevelResourceUsageView.class); when(mockView.getResourceUsageData()).thenReturn(Collections.singletonMap(resourceType, usage)); when(mockView.getActiveTasks()).thenReturn(ids.stream().map(this::getRandomSearchTask).collect(Collectors.toList())); diff --git a/server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTests.java b/server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTests.java deleted file mode 100644 index 16c47e956a841..0000000000000 --- a/server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTests.java +++ /dev/null @@ -1,98 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. 
- */ - -package org.opensearch.wlm.tracker; - -import org.opensearch.cluster.metadata.QueryGroup; -import org.opensearch.core.tasks.resourcetracker.ResourceStats; -import org.opensearch.test.OpenSearchTestCase; -import org.opensearch.wlm.MutableQueryGroupFragment; -import org.opensearch.wlm.MutableQueryGroupFragment.ResiliencyMode; -import org.opensearch.wlm.QueryGroupTask; -import org.opensearch.wlm.ResourceType; -import org.opensearch.wlm.WorkloadManagementSettings; -import org.opensearch.wlm.tracker.QueryGroupResourceUsage.QueryGroupCpuUsage; -import org.opensearch.wlm.tracker.QueryGroupResourceUsage.QueryGroupMemoryUsage; -import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerServiceTests.TestClock; - -import java.util.List; -import java.util.Map; - -import static org.opensearch.wlm.cancellation.DefaultTaskCancellation.MIN_VALUE; -import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.HEAP_SIZE_BYTES; -import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.PROCESSOR_COUNT; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - -public class QueryGroupResourceUsageTests extends OpenSearchTestCase { - QueryGroupResourceUsage sut; - WorkloadManagementSettings settings; - - public void testFactoryMethods() { - assertTrue(QueryGroupResourceUsage.from(ResourceType.CPU) instanceof QueryGroupCpuUsage); - assertTrue(QueryGroupResourceUsage.from(ResourceType.MEMORY) instanceof QueryGroupMemoryUsage); - assertThrows(IllegalArgumentException.class, () -> QueryGroupResourceUsage.from(null)); - } - - public void testQueryGroupCpuUsage() { - sut = new QueryGroupCpuUsage(); - TestClock clock = new TestClock(); - long fastForwardTime = PROCESSOR_COUNT * 200L; - clock.fastForwardBy(fastForwardTime); - QueryGroup queryGroup = new QueryGroup( - "testQG", - new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(ResourceType.CPU, 0.5 / PROCESSOR_COUNT)) - ); - - 
sut.initialise(List.of(createMockTaskWithResourceStats(QueryGroupTask.class, fastForwardTime, 200, 0, 123)), clock::getTime); - settings = mock(WorkloadManagementSettings.class); - when(settings.getNodeLevelCpuCancellationThreshold()).thenReturn(0.90); - - double expectedNormalisedThreshold = 0.5 / PROCESSOR_COUNT * 0.9; - double expectedQueryGroupCpuUsage = 1.0 / PROCESSOR_COUNT; - double expectedReduceBy = expectedQueryGroupCpuUsage - expectedNormalisedThreshold; - assertEquals(expectedNormalisedThreshold, sut.getNormalisedThresholdFor(queryGroup, settings), MIN_VALUE); - assertEquals(expectedQueryGroupCpuUsage, sut.getCurrentUsage(), MIN_VALUE); - assertTrue(sut.isBreachingThresholdFor(queryGroup, settings)); - assertEquals(expectedReduceBy, sut.getReduceByFor(queryGroup, settings), MIN_VALUE); - } - - public void testQueryGroupMemoryUsage() { - sut = new QueryGroupMemoryUsage(); - TestClock clock = new TestClock(); - QueryGroup queryGroup = new QueryGroup( - "testQG", - new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(ResourceType.MEMORY, 500.0 / HEAP_SIZE_BYTES)) - ); - - sut.initialise(List.of(createMockTaskWithResourceStats(QueryGroupTask.class, 100, 200, 0, 123)), clock::getTime); - settings = mock(WorkloadManagementSettings.class); - when(settings.getNodeLevelMemoryCancellationThreshold()).thenReturn(0.90); - - double expectedNormalisedThreshold = 500.0 / HEAP_SIZE_BYTES * 0.9; - double expectedCurrentUsage = 200.0 / HEAP_SIZE_BYTES; - assertEquals(expectedNormalisedThreshold, sut.getNormalisedThresholdFor(queryGroup, settings), MIN_VALUE); - assertEquals(expectedCurrentUsage, sut.getCurrentUsage(), MIN_VALUE); - assertFalse(sut.isBreachingThresholdFor(queryGroup, settings)); - } - - public static T createMockTaskWithResourceStats( - Class type, - long cpuUsage, - long heapUsage, - long startTimeNanos, - long taskId - ) { - T task = mock(type); - when(task.getTotalResourceUtilization(ResourceStats.CPU)).thenReturn(cpuUsage); - 
when(task.getTotalResourceUtilization(ResourceStats.MEMORY)).thenReturn(heapUsage); - when(task.getStartTimeNanos()).thenReturn(startTimeNanos); - when(task.getId()).thenReturn(taskId); - return task; - } -} diff --git a/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java new file mode 100644 index 0000000000000..6e40500b4f1f6 --- /dev/null +++ b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java @@ -0,0 +1,84 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.wlm.tracker; + +import org.opensearch.cluster.metadata.QueryGroup; +import org.opensearch.core.tasks.resourcetracker.ResourceStats; +import org.opensearch.test.OpenSearchTestCase; +import org.opensearch.wlm.MutableQueryGroupFragment; +import org.opensearch.wlm.MutableQueryGroupFragment.ResiliencyMode; +import org.opensearch.wlm.QueryGroupTask; +import org.opensearch.wlm.ResourceType; +import org.opensearch.wlm.tracker.ResourceUsageCalculatorTrackerServiceTests.TestClock; + +import java.util.List; +import java.util.Map; + +import static org.opensearch.wlm.cancellation.TaskCanceller.MIN_VALUE; +import static org.opensearch.wlm.tracker.CpuUsageCalculator.PROCESSOR_COUNT; +import static org.opensearch.wlm.tracker.MemoryUsageCalculator.HEAP_SIZE_BYTES; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class ResourceUsageCalculatorTests extends OpenSearchTestCase { + ResourceUsageCalculator sut; + + public void testFactoryMethods() { + ResourceUsageCalculatorFactory resourceUsageCalculatorFactory = ResourceUsageCalculatorFactory.getInstance(); + assertTrue(resourceUsageCalculatorFactory.getInstanceForResourceType(ResourceType.CPU) instanceof 
CpuUsageCalculator); + assertTrue(resourceUsageCalculatorFactory.getInstanceForResourceType(ResourceType.MEMORY) instanceof MemoryUsageCalculator); + assertThrows(IllegalArgumentException.class, () -> resourceUsageCalculatorFactory.getInstanceForResourceType(null)); + } + + public void testQueryGroupCpuUsage() { + sut = CpuUsageCalculator.getInstance(); + TestClock clock = new TestClock(); + long fastForwardTime = PROCESSOR_COUNT * 200L; + clock.fastForwardBy(fastForwardTime); + QueryGroup queryGroup = new QueryGroup( + "testQG", + new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(ResourceType.CPU, 0.5 / PROCESSOR_COUNT)) + ); + double expectedQueryGroupCpuUsage = 1.0 / PROCESSOR_COUNT; + + QueryGroupTask mockTask = createMockTaskWithResourceStats(QueryGroupTask.class, fastForwardTime, 200, 0, 123); + double actualUsage = sut.calculateResourceUsage(List.of(mockTask), clock::getTime); + assertEquals(expectedQueryGroupCpuUsage, actualUsage, MIN_VALUE); + + double taskResourceUsage = sut.calculateTaskResourceUsage(mockTask, clock::getTime); + assertEquals(1.0, taskResourceUsage, MIN_VALUE); + } + + public void testQueryGroupMemoryUsage() { + sut = MemoryUsageCalculator.getInstance(); + TestClock clock = new TestClock(); + + QueryGroupTask mockTask = createMockTaskWithResourceStats(QueryGroupTask.class, 100, 200, 0, 123); + double actualMemoryUsage = sut.calculateResourceUsage(List.of(mockTask), clock::getTime); + double expectedMemoryUsage = 200.0 / HEAP_SIZE_BYTES; + + assertEquals(expectedMemoryUsage, actualMemoryUsage, MIN_VALUE); + assertEquals(200.0 / HEAP_SIZE_BYTES, sut.calculateTaskResourceUsage(mockTask, clock::getTime), MIN_VALUE); + } + + public static T createMockTaskWithResourceStats( + Class type, + long cpuUsage, + long heapUsage, + long startTimeNanos, + long taskId + ) { + T task = mock(type); + when(task.getTotalResourceUtilization(ResourceStats.CPU)).thenReturn(cpuUsage); + 
when(task.getTotalResourceUtilization(ResourceStats.MEMORY)).thenReturn(heapUsage); + when(task.getStartTimeNanos()).thenReturn(startTimeNanos); + when(task.getId()).thenReturn(taskId); + return task; + } +} diff --git a/server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerServiceTests.java b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTrackerServiceTests.java similarity index 86% rename from server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerServiceTests.java rename to server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTrackerServiceTests.java index 4e59e88de9baa..4f5c5655a8bc7 100644 --- a/server/src/test/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerServiceTests.java +++ b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTrackerServiceTests.java @@ -21,6 +21,7 @@ import org.opensearch.wlm.QueryGroupLevelResourceUsageView; import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; +import org.opensearch.wlm.WorkloadManagementSettings; import org.junit.After; import org.junit.Before; @@ -31,18 +32,22 @@ import java.util.concurrent.atomic.AtomicBoolean; import static org.opensearch.wlm.QueryGroupTask.QUERY_GROUP_ID_HEADER; -import static org.opensearch.wlm.cancellation.DefaultTaskCancellation.MIN_VALUE; -import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.HEAP_SIZE_BYTES; -import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.PROCESSOR_COUNT; +import static org.opensearch.wlm.cancellation.TaskCanceller.MIN_VALUE; +import static org.opensearch.wlm.tracker.CpuUsageCalculator.PROCESSOR_COUNT; +import static org.opensearch.wlm.tracker.MemoryUsageCalculator.HEAP_SIZE_BYTES; import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.Mockito.doAnswer; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; -public class 
QueryGroupResourceUsageTrackerServiceTests extends OpenSearchTestCase { +public class ResourceUsageCalculatorTrackerServiceTests extends OpenSearchTestCase { TestThreadPool threadPool; TaskResourceTrackingService mockTaskResourceTrackingService; QueryGroupResourceUsageTrackerService queryGroupResourceUsageTrackerService; + WorkloadManagementSettings settings; + ResourceUsageCalculatorFactory resourceUsageCalculatorFactory; + CpuUsageCalculator cpuUsageCalculator; + MemoryUsageCalculator memoryUsageCalculator; public static class TestClock { long time; @@ -61,9 +66,15 @@ public long getTime() { @Before public void setup() { clock = new TestClock(); + settings = mock(WorkloadManagementSettings.class); threadPool = new TestThreadPool(getTestName()); mockTaskResourceTrackingService = mock(TaskResourceTrackingService.class); - queryGroupResourceUsageTrackerService = new QueryGroupResourceUsageTrackerService(mockTaskResourceTrackingService, clock::getTime); + resourceUsageCalculatorFactory = ResourceUsageCalculatorFactory.getInstance(); + queryGroupResourceUsageTrackerService = new QueryGroupResourceUsageTrackerService( + mockTaskResourceTrackingService, + clock::getTime, + resourceUsageCalculatorFactory + ); } @After @@ -77,21 +88,19 @@ public void testConstructQueryGroupLevelViews_CreatesQueryGroupLevelUsageView_Wh Map activeSearchShardTasks = createActiveSearchShardTasks(queryGroupIds); when(mockTaskResourceTrackingService.getResourceAwareTasks()).thenReturn(activeSearchShardTasks); clock.fastForwardBy(2000); + Map stringQueryGroupLevelResourceUsageViewMap = queryGroupResourceUsageTrackerService .constructQueryGroupLevelUsageViews(); for (String queryGroupId : queryGroupIds) { assertEquals( (400 * 1.0f) / HEAP_SIZE_BYTES, - stringQueryGroupLevelResourceUsageViewMap.get(queryGroupId) - .getResourceUsageData() - .get(ResourceType.MEMORY) - .getCurrentUsage(), + stringQueryGroupLevelResourceUsageViewMap.get(queryGroupId).getResourceUsageData().get(ResourceType.MEMORY), 
MIN_VALUE ); assertEquals( (200 * 1.0f) / (PROCESSOR_COUNT * 2000), - stringQueryGroupLevelResourceUsageViewMap.get(queryGroupId).getResourceUsageData().get(ResourceType.CPU).getCurrentUsage(), + stringQueryGroupLevelResourceUsageViewMap.get(queryGroupId).getResourceUsageData().get(ResourceType.CPU), MIN_VALUE ); assertEquals(2, stringQueryGroupLevelResourceUsageViewMap.get(queryGroupId).getActiveTasks().size()); @@ -115,12 +124,12 @@ public void testConstructQueryGroupLevelUsageViews_WithTasksHavingDifferentResou assertEquals( (double) 600 / HEAP_SIZE_BYTES, - queryGroupViews.get("queryGroup1").getResourceUsageData().get(ResourceType.MEMORY).getCurrentUsage(), + queryGroupViews.get("queryGroup1").getResourceUsageData().get(ResourceType.MEMORY), MIN_VALUE ); assertEquals( ((double) 300) / (PROCESSOR_COUNT * 2000), - queryGroupViews.get("queryGroup1").getResourceUsageData().get(ResourceType.CPU).getCurrentUsage(), + queryGroupViews.get("queryGroup1").getResourceUsageData().get(ResourceType.CPU), MIN_VALUE ); assertEquals(2, queryGroupViews.get("queryGroup1").getActiveTasks().size()); diff --git a/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageUtilTests.java b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageUtilTests.java new file mode 100644 index 0000000000000..943a15825ea63 --- /dev/null +++ b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageUtilTests.java @@ -0,0 +1,70 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.wlm.tracker; + +import org.opensearch.cluster.metadata.QueryGroup; +import org.opensearch.test.OpenSearchTestCase; +import org.opensearch.wlm.MutableQueryGroupFragment; +import org.opensearch.wlm.ResourceType; +import org.opensearch.wlm.WorkloadManagementSettings; +import org.opensearch.wlm.tracker.ResourceUsageUtil.CpuUsageUtil; +import org.opensearch.wlm.tracker.ResourceUsageUtil.MemoryUsageUtil; + +import java.util.Map; + +import static org.opensearch.wlm.cancellation.TaskCanceller.MIN_VALUE; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class ResourceUsageUtilTests extends OpenSearchTestCase { + ResourceUsageUtil sut; + ResourceUsageUtilFactory resourceUsageUtilFactory; + WorkloadManagementSettings settings; + + public void setUp() throws Exception { + super.setUp(); + resourceUsageUtilFactory = ResourceUsageUtilFactory.getInstance(); + settings = mock(WorkloadManagementSettings.class); + } + + public void testFactoryClass() { + assertTrue(resourceUsageUtilFactory.getInstanceForResourceType(ResourceType.CPU) instanceof CpuUsageUtil); + assertTrue(resourceUsageUtilFactory.getInstanceForResourceType(ResourceType.MEMORY) instanceof MemoryUsageUtil); + assertThrows(IllegalArgumentException.class, () -> resourceUsageUtilFactory.getInstanceForResourceType(null)); + } + + public void testCpuUsageUtil() { + sut = resourceUsageUtilFactory.getInstanceForResourceType(ResourceType.CPU); + QueryGroup queryGroup = new QueryGroup( + "testQG", + new MutableQueryGroupFragment(MutableQueryGroupFragment.ResiliencyMode.ENFORCED, Map.of(ResourceType.CPU, 0.50)) + ); + when(settings.getNodeLevelCpuCancellationThreshold()).thenReturn(0.9); + + assertExpectedValues(queryGroup); + } + + public void testMemoryUsageUtil() { + sut = resourceUsageUtilFactory.getInstanceForResourceType(ResourceType.MEMORY); + QueryGroup queryGroup = new QueryGroup( + "testQG", + new 
MutableQueryGroupFragment(MutableQueryGroupFragment.ResiliencyMode.ENFORCED, Map.of(ResourceType.MEMORY, 0.50)) + ); + when(settings.getNodeLevelMemoryCancellationThreshold()).thenReturn(0.9); + assertExpectedValues(queryGroup); + } + + private void assertExpectedValues(QueryGroup queryGroup) { + double normalisedThreshold = sut.getNormalisedThreshold(queryGroup, settings); + assertEquals(0.45, normalisedThreshold, MIN_VALUE); + + assertEquals(0.1, sut.getExcessUsage(queryGroup, 0.55, settings), MIN_VALUE); + assertTrue(sut.isBreachingThresholdFor(queryGroup, 0.55, settings)); + } +} diff --git a/server/src/test/java/org/opensearch/wlm/tracker/TaskResourceUsageCalculatorTests.java b/server/src/test/java/org/opensearch/wlm/tracker/TaskResourceUsageCalculatorTests.java deleted file mode 100644 index c8e7209dff20d..0000000000000 --- a/server/src/test/java/org/opensearch/wlm/tracker/TaskResourceUsageCalculatorTests.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. 
- */ - -package org.opensearch.wlm.tracker; - -import org.opensearch.test.OpenSearchTestCase; -import org.opensearch.wlm.QueryGroupTask; -import org.opensearch.wlm.ResourceType; -import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerServiceTests.TestClock; -import org.opensearch.wlm.tracker.TaskResourceUsageCalculator.TaskCpuUsageCalculator; -import org.opensearch.wlm.tracker.TaskResourceUsageCalculator.TaskMemoryUsageCalculator; - -import static org.opensearch.wlm.cancellation.DefaultTaskCancellation.MIN_VALUE; -import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTests.createMockTaskWithResourceStats; -import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.HEAP_SIZE_BYTES; - -public class TaskResourceUsageCalculatorTests extends OpenSearchTestCase { - TaskResourceUsageCalculator sut; - - public void testFactoryMethod() { - assertTrue(TaskResourceUsageCalculator.from(ResourceType.CPU) instanceof TaskCpuUsageCalculator); - assertTrue(TaskResourceUsageCalculator.from(ResourceType.MEMORY) instanceof TaskMemoryUsageCalculator); - assertThrows(IllegalArgumentException.class, () -> TaskMemoryUsageCalculator.from(null)); - } - - public void testTaskCpuUsageCalculator() { - sut = new TaskCpuUsageCalculator(); - TestClock clock = new TestClock(); - QueryGroupTask task = createMockTaskWithResourceStats(QueryGroupTask.class, 100, 200, 0, 1); - clock.fastForwardBy(200); - - double expectedUsage = 0.5; - double actualUsage = sut.calculateFor(task, clock::getTime); - assertEquals(expectedUsage, actualUsage, MIN_VALUE); - } - - public void testTaskMemoryUsageCalculator() { - sut = new TaskMemoryUsageCalculator(); - TestClock clock = new TestClock(); - QueryGroupTask task = createMockTaskWithResourceStats(QueryGroupTask.class, 100, 200, 0, 1); - clock.fastForwardBy(200); - - double expectedUsage = 200.0 / HEAP_SIZE_BYTES; - double actualUsage = sut.calculateFor(task, clock::getTime); - assertEquals(expectedUsage, actualUsage, 
MIN_VALUE); - } -} From 3fc21bece0223d65836268bbffb4e73d3cc89293 Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Thu, 5 Sep 2024 14:02:46 -0700 Subject: [PATCH 21/47] fix javadocs Signed-off-by: Kaushal Kumar --- .../java/org/opensearch/wlm/tracker/ResourceUsageUtil.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtil.java b/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtil.java index d616dc96088a0..c647273306290 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtil.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtil.java @@ -42,6 +42,9 @@ public double getExcessUsage(QueryGroup queryGroup, double currentUsage, Workloa */ protected abstract double getNormalisedThreshold(QueryGroup queryGroup, WorkloadManagementSettings settings); + /** + * Utility class to provide query group level helper methods for CPU resource type + */ public static class CpuUsageUtil extends ResourceUsageUtil { private static final CpuUsageUtil instance = new CpuUsageUtil(); @@ -57,6 +60,9 @@ protected double getNormalisedThreshold(QueryGroup queryGroup, WorkloadManagemen } } + /** + * Utility class to provide query group level helper methods for Memory Resource + */ public static class MemoryUsageUtil extends ResourceUsageUtil { private static final MemoryUsageUtil instance = new MemoryUsageUtil(); From fe02a6a32304d79820716304151bed889bb542bb Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Thu, 5 Sep 2024 17:40:52 -0700 Subject: [PATCH 22/47] remove code clutter Signed-off-by: Kaushal Kumar --- .../cluster/metadata/QueryGroup.java | 3 +- .../org/opensearch/wlm/QueryGroupTask.java | 2 + .../java/org/opensearch/wlm/ResourceType.java | 46 ++++++++- .../wlm/WorkloadManagementSettings.java | 2 + ...gestTaskRunningFirstSelectionStrategy.java | 13 +-- .../wlm/cancellation/TaskCanceller.java | 15 +-- .../wlm/tracker/CpuUsageCalculator.java | 10 +- 
.../wlm/tracker/MemoryUsageCalculator.java | 6 +- ...QueryGroupResourceUsageTrackerService.java | 13 +-- .../ResourceUsageCalculatorFactory.java | 33 ------- .../wlm/tracker/ResourceUsageUtil.java | 61 ++++-------- .../wlm/tracker/ResourceUsageUtilFactory.java | 33 ------- ...askRunningFirstSelectionStrategyTests.java | 8 +- .../wlm/cancellation/TaskCancellerTests.java | 94 +++---------------- .../tracker/ResourceUsageCalculatorTests.java | 18 +--- ...rceUsageCalculatorTrackerServiceTests.java | 10 +- .../wlm/tracker/ResourceUsageUtilTests.java | 70 -------------- 17 files changed, 100 insertions(+), 337 deletions(-) delete mode 100644 server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorFactory.java delete mode 100644 server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtilFactory.java delete mode 100644 server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageUtilTests.java diff --git a/server/src/main/java/org/opensearch/cluster/metadata/QueryGroup.java b/server/src/main/java/org/opensearch/cluster/metadata/QueryGroup.java index dcd96dceb4bf1..94afd3f12fa93 100644 --- a/server/src/main/java/org/opensearch/cluster/metadata/QueryGroup.java +++ b/server/src/main/java/org/opensearch/cluster/metadata/QueryGroup.java @@ -12,6 +12,7 @@ import org.opensearch.cluster.Diff; import org.opensearch.common.UUIDs; import org.opensearch.common.annotation.ExperimentalApi; +import org.opensearch.common.annotation.PublicApi; import org.opensearch.core.common.io.stream.StreamInput; import org.opensearch.core.common.io.stream.StreamOutput; import org.opensearch.core.xcontent.ToXContentObject; @@ -41,7 +42,7 @@ * "updated_at": 4513232415 * } */ -@ExperimentalApi +@PublicApi(since = "2.17.0") public class QueryGroup extends AbstractDiffable implements ToXContentObject { public static final String _ID_STRING = "_id"; diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java b/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java index 
4eb413be61b72..1fb9a0b58c8de 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java @@ -10,6 +10,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.opensearch.common.annotation.PublicApi; import org.opensearch.common.unit.TimeValue; import org.opensearch.common.util.concurrent.ThreadContext; import org.opensearch.core.tasks.TaskId; @@ -24,6 +25,7 @@ /** * Base class to define QueryGroup tasks */ +@PublicApi(since = "2.17.0") public class QueryGroupTask extends CancellableTask { private static final Logger logger = LogManager.getLogger(QueryGroupTask.class); diff --git a/server/src/main/java/org/opensearch/wlm/ResourceType.java b/server/src/main/java/org/opensearch/wlm/ResourceType.java index 3fde1bbf1fee5..b2a75a60e23b5 100644 --- a/server/src/main/java/org/opensearch/wlm/ResourceType.java +++ b/server/src/main/java/org/opensearch/wlm/ResourceType.java @@ -8,11 +8,17 @@ package org.opensearch.wlm; +import org.opensearch.cluster.metadata.QueryGroup; import org.opensearch.common.annotation.PublicApi; import org.opensearch.core.common.io.stream.StreamOutput; +import org.opensearch.wlm.tracker.CpuUsageCalculator; +import org.opensearch.wlm.tracker.MemoryUsageCalculator; +import org.opensearch.wlm.tracker.ResourceUsageCalculator; +import org.opensearch.wlm.tracker.ResourceUsageUtil; import java.io.IOException; import java.util.List; +import java.util.function.Supplier; /** * Enum to hold the resource type @@ -21,16 +27,30 @@ */ @PublicApi(since = "2.17.0") public enum ResourceType { - CPU("cpu", true), - MEMORY("memory", true); + CPU("cpu", true, CpuUsageCalculator.INSTANCE, new ResourceUsageUtil() { + @Override + protected double getNormalisedThreshold(QueryGroup queryGroup) { + return queryGroup.getResourceLimits().get(ResourceType.CPU) * getSettings().getNodeLevelCpuCancellationThreshold(); + } + }), + MEMORY("memory", true, 
MemoryUsageCalculator.INSTANCE, new ResourceUsageUtil() { + @Override + protected double getNormalisedThreshold(QueryGroup queryGroup) { + return queryGroup.getResourceLimits().get(ResourceType.MEMORY) * getSettings().getNodeLevelMemoryCancellationThreshold(); + } + }); private final String name; private final boolean statsEnabled; + private final ResourceUsageCalculator resourceUsageCalculator; + private final ResourceUsageUtil resourceUsageUtil; private static List sortedValues = List.of(CPU, MEMORY); - ResourceType(String name, boolean statsEnabled) { + ResourceType(String name, boolean statsEnabled, ResourceUsageCalculator resourceUsageCalculator, ResourceUsageUtil resourceUsageUtil) { this.name = name; this.statsEnabled = statsEnabled; + this.resourceUsageCalculator = resourceUsageCalculator; + this.resourceUsageUtil = resourceUsageUtil; } /** @@ -59,6 +79,26 @@ public boolean hasStatsEnabled() { return statsEnabled; } + public double calculateQueryGroupUsage(List tasks, Supplier nanoTimeSupplier) { + return resourceUsageCalculator.calculateResourceUsage(tasks, nanoTimeSupplier); + } + + public double calculateTaskUsage(QueryGroupTask task, Supplier nanoTimeSupplier) { + return resourceUsageCalculator.calculateTaskResourceUsage(task, nanoTimeSupplier); + } + + public boolean isBreachingThreshold(QueryGroup queryGroup, double currentUsage) { + return getExcessUsage(queryGroup, currentUsage) > 0; + } + + public double getExcessUsage(QueryGroup queryGroup, double currentUsage) { + return resourceUsageUtil.getExcessUsage(queryGroup, currentUsage); + } + + public void setWorkloadManagementSettings(WorkloadManagementSettings settings) { + resourceUsageUtil.setSettings(settings); + } + public static List getSortedValues() { return sortedValues; } diff --git a/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java b/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java index b104925df77b3..e9240f5517e85 100644 --- 
a/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java +++ b/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java @@ -8,6 +8,7 @@ package org.opensearch.wlm; +import org.opensearch.common.annotation.PublicApi; import org.opensearch.common.settings.ClusterSettings; import org.opensearch.common.settings.Setting; import org.opensearch.common.settings.Settings; @@ -15,6 +16,7 @@ /** * Main class to declare Workload Management related settings */ +@PublicApi(since = "2.17.0") public class WorkloadManagementSettings { private static final Double DEFAULT_NODE_LEVEL_MEMORY_REJECTION_THRESHOLD = 0.8; private static final Double DEFAULT_NODE_LEVEL_MEMORY_CANCELLATION_THRESHOLD = 0.9; diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/LongestTaskRunningFirstSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/LongestTaskRunningFirstSelectionStrategy.java index 26669fc0d84e6..f05149c935e3f 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/LongestTaskRunningFirstSelectionStrategy.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/LongestTaskRunningFirstSelectionStrategy.java @@ -10,7 +10,6 @@ import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; -import org.opensearch.wlm.tracker.ResourceUsageCalculatorFactory; import java.util.ArrayList; import java.util.Collections; @@ -27,18 +26,13 @@ public class LongestTaskRunningFirstSelectionStrategy implements TaskSelectionStrategy { private final Supplier nanoTimeSupplier; - private final ResourceUsageCalculatorFactory resourceUsageCalculatorFactory; public LongestTaskRunningFirstSelectionStrategy() { - this(System::nanoTime, ResourceUsageCalculatorFactory.getInstance()); + this(System::nanoTime); } - public LongestTaskRunningFirstSelectionStrategy( - Supplier nanoTimeSupplier, - ResourceUsageCalculatorFactory resourceUsageCalculatorFactory - ) { + public 
LongestTaskRunningFirstSelectionStrategy(Supplier nanoTimeSupplier) { this.nanoTimeSupplier = nanoTimeSupplier; - this.resourceUsageCalculatorFactory = resourceUsageCalculatorFactory; } /** @@ -75,8 +69,7 @@ public List selectTasksForCancellation(List task double accumulated = 0; for (QueryGroupTask task : sortedTasks) { selectedTasks.add(task); - accumulated += resourceUsageCalculatorFactory.getInstanceForResourceType(resourceType) - .calculateTaskResourceUsage(task, nanoTimeSupplier); + accumulated += resourceType.calculateTaskUsage(task, nanoTimeSupplier); if ((accumulated - limit) > MIN_VALUE) { break; } diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java b/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java index 4ed7148b0f4fa..3caa5c0cc062e 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java @@ -16,8 +16,6 @@ import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; import org.opensearch.wlm.WorkloadManagementSettings; -import org.opensearch.wlm.tracker.ResourceUsageUtil; -import org.opensearch.wlm.tracker.ResourceUsageUtilFactory; import java.util.ArrayList; import java.util.Collection; @@ -56,7 +54,6 @@ public class TaskCanceller { protected final Collection activeQueryGroups; protected final Collection deletedQueryGroups; protected BooleanSupplier isNodeInDuress; - private final ResourceUsageUtilFactory resourceUsageUtilFactory; public TaskCanceller( WorkloadManagementSettings workloadManagementSettings, @@ -64,8 +61,7 @@ public TaskCanceller( Map queryGroupLevelResourceUsageViews, Collection activeQueryGroups, Collection deletedQueryGroups, - BooleanSupplier isNodeInDuress, - ResourceUsageUtilFactory resourceUsageUtilFactory + BooleanSupplier isNodeInDuress ) { this.workloadManagementSettings = workloadManagementSettings; this.taskSelectionStrategy = 
taskSelectionStrategy; @@ -73,7 +69,7 @@ public TaskCanceller( this.activeQueryGroups = activeQueryGroups; this.deletedQueryGroups = deletedQueryGroups; this.isNodeInDuress = isNodeInDuress; - this.resourceUsageUtilFactory = resourceUsageUtilFactory; + TRACKED_RESOURCES.forEach(resourceType -> resourceType.setWorkloadManagementSettings(workloadManagementSettings)); } /** @@ -141,8 +137,7 @@ private List getQueryGroupsToCancelFrom(ResiliencyMode resiliencyMod for (ResourceType resourceType : TRACKED_RESOURCES) { if (queryGroup.getResourceLimits().containsKey(resourceType)) { final double currentUsage = queryGroupResourcesUsage.get(resourceType); - final ResourceUsageUtil resourceUsageUtil = resourceUsageUtilFactory.getInstanceForResourceType(resourceType); - if (resourceUsageUtil.isBreachingThresholdFor(queryGroup, currentUsage, workloadManagementSettings)) { + if (resourceType.isBreachingThreshold(queryGroup, currentUsage)) { queryGroupsToCancelFrom.add(queryGroup); break; } @@ -235,9 +230,7 @@ private double getExcessUsage(QueryGroup queryGroup, ResourceType resourceType) final QueryGroupLevelResourceUsageView queryGroupResourceUsageView = queryGroupLevelResourceUsageViews.get(queryGroup.get_id()); final double currentUsage = queryGroupResourceUsageView.getResourceUsageData().get(resourceType); - final ResourceUsageUtil resourceUsageUtil = resourceUsageUtilFactory.getInstanceForResourceType(resourceType); - - return resourceUsageUtil.getExcessUsage(queryGroup, currentUsage, workloadManagementSettings); + return resourceType.getExcessUsage(queryGroup, currentUsage); } private void callbackOnCancel() { diff --git a/server/src/main/java/org/opensearch/wlm/tracker/CpuUsageCalculator.java b/server/src/main/java/org/opensearch/wlm/tracker/CpuUsageCalculator.java index 95c9dfa3e2ca1..3cbab3db010c6 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/CpuUsageCalculator.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/CpuUsageCalculator.java @@ -20,18 
+20,10 @@ public class CpuUsageCalculator implements ResourceUsageCalculator { // This value should be initialised at the start time of the process and be used throughout the codebase public static final int PROCESSOR_COUNT = Runtime.getRuntime().availableProcessors(); - private static final CpuUsageCalculator instance = new CpuUsageCalculator(); + public static final CpuUsageCalculator INSTANCE = new CpuUsageCalculator(); private CpuUsageCalculator() {} - /** - * static method to access the singleton - * @return eager singleton object of the class - */ - public static CpuUsageCalculator getInstance() { - return instance; - } - @Override public double calculateResourceUsage(List tasks, Supplier timeSupplier) { double usage = tasks.stream().mapToDouble(task -> calculateTaskResourceUsage(task, timeSupplier)).sum(); diff --git a/server/src/main/java/org/opensearch/wlm/tracker/MemoryUsageCalculator.java b/server/src/main/java/org/opensearch/wlm/tracker/MemoryUsageCalculator.java index e1ac592360dd3..dd72ac9007705 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/MemoryUsageCalculator.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/MemoryUsageCalculator.java @@ -20,14 +20,10 @@ */ public class MemoryUsageCalculator implements ResourceUsageCalculator { public static final long HEAP_SIZE_BYTES = JvmStats.jvmStats().getMem().getHeapMax().getBytes(); - private static final MemoryUsageCalculator instance = new MemoryUsageCalculator(); + public static final MemoryUsageCalculator INSTANCE = new MemoryUsageCalculator(); private MemoryUsageCalculator() {} - public static MemoryUsageCalculator getInstance() { - return instance; - } - @Override public double calculateResourceUsage(List tasks, Supplier timeSupplier) { return tasks.stream().mapToDouble(task -> calculateTaskResourceUsage(task, timeSupplier)).sum(); diff --git a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java 
b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java index dc3dde0c78886..0e7fab599dfcb 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java @@ -27,21 +27,15 @@ public class QueryGroupResourceUsageTrackerService { public static final EnumSet TRACKED_RESOURCES = EnumSet.allOf(ResourceType.class); private final TaskResourceTrackingService taskResourceTrackingService; private final Supplier nanoTimeSupplier; - private final ResourceUsageCalculatorFactory resourceUsageCalculatorFactory; /** * QueryGroupResourceTrackerService constructor * * @param taskResourceTrackingService Service that helps track resource usage of tasks running on a node. */ - public QueryGroupResourceUsageTrackerService( - TaskResourceTrackingService taskResourceTrackingService, - Supplier nanoTimeSupplier, - ResourceUsageCalculatorFactory resourceUsageCalculatorFactory - ) { + public QueryGroupResourceUsageTrackerService(TaskResourceTrackingService taskResourceTrackingService, Supplier nanoTimeSupplier) { this.taskResourceTrackingService = taskResourceTrackingService; this.nanoTimeSupplier = nanoTimeSupplier; - this.resourceUsageCalculatorFactory = resourceUsageCalculatorFactory; } /** @@ -58,10 +52,7 @@ public Map constructQueryGroupLevelUsa // Compute the QueryGroup resource usage final Map resourceUsage = new HashMap<>(); for (ResourceType resourceType : TRACKED_RESOURCES) { - final ResourceUsageCalculator resourceUsageCalculator = resourceUsageCalculatorFactory.getInstanceForResourceType( - resourceType - ); - double usage = resourceUsageCalculator.calculateResourceUsage(queryGroupEntry.getValue(), nanoTimeSupplier); + double usage = resourceType.calculateQueryGroupUsage(queryGroupEntry.getValue(), nanoTimeSupplier); resourceUsage.put(resourceType, usage); } diff --git 
a/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorFactory.java b/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorFactory.java deleted file mode 100644 index c0e9c285ccfa8..0000000000000 --- a/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorFactory.java +++ /dev/null @@ -1,33 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -package org.opensearch.wlm.tracker; - -import org.opensearch.wlm.ResourceType; - -/** - * Factory class for {@link ResourceUsageCalculator} singleton implementations - */ -public class ResourceUsageCalculatorFactory { - private static ResourceUsageCalculatorFactory instance = new ResourceUsageCalculatorFactory(); - - private ResourceUsageCalculatorFactory() {} - - public static ResourceUsageCalculatorFactory getInstance() { - return instance; - } - - public ResourceUsageCalculator getInstanceForResourceType(ResourceType type) { - if (type == ResourceType.CPU) { - return CpuUsageCalculator.getInstance(); - } else if (type == ResourceType.MEMORY) { - return MemoryUsageCalculator.getInstance(); - } - throw new IllegalArgumentException(type + " is an invalid resource type"); - } -} diff --git a/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtil.java b/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtil.java index c647273306290..dc23914ffa6bb 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtil.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtil.java @@ -9,21 +9,34 @@ package org.opensearch.wlm.tracker; import org.opensearch.cluster.metadata.QueryGroup; -import org.opensearch.wlm.ResourceType; import org.opensearch.wlm.WorkloadManagementSettings; /** * Utility class to provide utility methods at query group level */ public abstract 
class ResourceUsageUtil { + private WorkloadManagementSettings settings; + + public WorkloadManagementSettings getSettings() { + return settings; + } + + /** + * WorkloadManagementSettings setter + * @param settings + */ + public void setSettings(WorkloadManagementSettings settings) { + this.settings = settings; + } + /** * Determines whether {@link QueryGroup} is breaching its threshold for the resource * @param queryGroup * @param currentUsage * @return whether the query group is breaching threshold for this resource */ - public boolean isBreachingThresholdFor(QueryGroup queryGroup, double currentUsage, WorkloadManagementSettings settings) { - return getExcessUsage(queryGroup, currentUsage, settings) > 0; + public boolean isBreachingThresholdFor(QueryGroup queryGroup, double currentUsage) { + return getExcessUsage(queryGroup, currentUsage) > 0; } /** @@ -31,8 +44,8 @@ public boolean isBreachingThresholdFor(QueryGroup queryGroup, double currentUsag * @param queryGroup instance * @return the overshooting limit for the resource */ - public double getExcessUsage(QueryGroup queryGroup, double currentUsage, WorkloadManagementSettings settings) { - return currentUsage - getNormalisedThreshold(queryGroup, settings); + public double getExcessUsage(QueryGroup queryGroup, double currentUsage) { + return currentUsage - getNormalisedThreshold(queryGroup); } /** @@ -40,41 +53,5 @@ public double getExcessUsage(QueryGroup queryGroup, double currentUsage, Workloa * @param queryGroup instance * @return normalised value with respect to node level cancellation thresholds */ - protected abstract double getNormalisedThreshold(QueryGroup queryGroup, WorkloadManagementSettings settings); - - /** - * Utility class to provide query group level helper methods for CPU resource type - */ - public static class CpuUsageUtil extends ResourceUsageUtil { - private static final CpuUsageUtil instance = new CpuUsageUtil(); - - private CpuUsageUtil() {} - - public static CpuUsageUtil getInstance() { 
- return instance; - } - - @Override - protected double getNormalisedThreshold(QueryGroup queryGroup, WorkloadManagementSettings settings) { - return queryGroup.getResourceLimits().get(ResourceType.CPU) * settings.getNodeLevelCpuCancellationThreshold(); - } - } - - /** - * Utility class to provide query group level helper methods for Memory Resource - */ - public static class MemoryUsageUtil extends ResourceUsageUtil { - private static final MemoryUsageUtil instance = new MemoryUsageUtil(); - - private MemoryUsageUtil() {} - - public static MemoryUsageUtil getInstance() { - return instance; - } - - @Override - public double getNormalisedThreshold(QueryGroup queryGroup, WorkloadManagementSettings settings) { - return queryGroup.getResourceLimits().get(ResourceType.MEMORY) * settings.getNodeLevelMemoryCancellationThreshold(); - } - } + protected abstract double getNormalisedThreshold(QueryGroup queryGroup); } diff --git a/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtilFactory.java b/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtilFactory.java deleted file mode 100644 index df68cdec30a6a..0000000000000 --- a/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtilFactory.java +++ /dev/null @@ -1,33 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. 
- */ - -package org.opensearch.wlm.tracker; - -import org.opensearch.wlm.ResourceType; - -/** - * Factory class for {@link ResourceUsageUtil} implementations - */ -public class ResourceUsageUtilFactory { - private static ResourceUsageUtilFactory instance = new ResourceUsageUtilFactory(); - - private ResourceUsageUtilFactory() {} - - public static ResourceUsageUtilFactory getInstance() { - return instance; - } - - public ResourceUsageUtil getInstanceForResourceType(ResourceType type) { - if (type == ResourceType.CPU) { - return ResourceUsageUtil.CpuUsageUtil.getInstance(); - } else if (type == ResourceType.MEMORY) { - return ResourceUsageUtil.MemoryUsageUtil.getInstance(); - } - throw new IllegalArgumentException(type + " is an invalid resource type"); - } -} diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/LongestTaskRunningFirstSelectionStrategyTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/LongestTaskRunningFirstSelectionStrategyTests.java index 2fcdc3121c2bf..a93b1acf67fa5 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/LongestTaskRunningFirstSelectionStrategyTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/LongestTaskRunningFirstSelectionStrategyTests.java @@ -17,8 +17,6 @@ import org.opensearch.test.OpenSearchTestCase; import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; -import org.opensearch.wlm.tracker.MemoryUsageCalculator; -import org.opensearch.wlm.tracker.ResourceUsageCalculatorFactory; import org.opensearch.wlm.tracker.ResourceUsageCalculatorTrackerServiceTests.TestClock; import java.util.ArrayList; @@ -30,13 +28,11 @@ public class LongestTaskRunningFirstSelectionStrategyTests extends OpenSearchTestCase { private TestClock clock; - private ResourceUsageCalculatorFactory resourceUsageCalculatorFactory; public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsGreaterThanZero() { clock = new TestClock(); - 
resourceUsageCalculatorFactory = ResourceUsageCalculatorFactory.getInstance(); LongestTaskRunningFirstSelectionStrategy testLongestTaskRunningFirstSelectionStrategy = - new LongestTaskRunningFirstSelectionStrategy(clock::getTime, resourceUsageCalculatorFactory); + new LongestTaskRunningFirstSelectionStrategy(clock::getTime); long thresholdInLong = 100L; double reduceBy = 50.0 / HEAP_SIZE_BYTES; ResourceType resourceType = ResourceType.MEMORY; @@ -83,7 +79,7 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsEqua private boolean tasksUsageMeetsThreshold(List selectedTasks, double threshold) { double memory = 0; for (QueryGroupTask task : selectedTasks) { - memory += MemoryUsageCalculator.getInstance().calculateTaskResourceUsage(task, clock::getTime); + memory += ResourceType.MEMORY.calculateTaskUsage(task, clock::getTime); if ((memory - threshold) > MIN_VALUE) { return true; } diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellerTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellerTests.java index b3ed024a04ad9..26fccc807aa5f 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellerTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellerTests.java @@ -21,9 +21,6 @@ import org.opensearch.wlm.ResourceType; import org.opensearch.wlm.WorkloadManagementSettings; import org.opensearch.wlm.tracker.ResourceUsageCalculatorTrackerServiceTests.TestClock; -import org.opensearch.wlm.tracker.ResourceUsageUtil.CpuUsageUtil; -import org.opensearch.wlm.tracker.ResourceUsageUtil.MemoryUsageUtil; -import org.opensearch.wlm.tracker.ResourceUsageUtilFactory; import org.junit.Before; import java.util.Collection; @@ -44,7 +41,6 @@ public class TaskCancellerTests extends OpenSearchTestCase { private static final String queryGroupId2 = "queryGroup2"; private TestClock clock; - private ResourceUsageUtilFactory resourceUsageUtilFactory; private static class 
TestTaskCancellerImpl extends TaskCanceller { @@ -54,8 +50,7 @@ public TestTaskCancellerImpl( Map queryGroupLevelViews, Set activeQueryGroups, Set deletedQueryGroups, - BooleanSupplier isNodeInDuress, - ResourceUsageUtilFactory resourceUsageUtilFactory + BooleanSupplier isNodeInDuress ) { super( workloadManagementSettings, @@ -63,8 +58,7 @@ public TestTaskCancellerImpl( queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, - isNodeInDuress, - resourceUsageUtilFactory + isNodeInDuress ); } } @@ -73,8 +67,6 @@ public TestTaskCancellerImpl( private Set activeQueryGroups; private Set deletedQueryGroups; private TaskCanceller taskCancellation; - private CpuUsageUtil cpuUsageUtil; - private MemoryUsageUtil memoryUsageUtil; private WorkloadManagementSettings workloadManagementSettings; @Before @@ -83,21 +75,17 @@ public void setup() { queryGroupLevelViews = new HashMap<>(); activeQueryGroups = new HashSet<>(); deletedQueryGroups = new HashSet<>(); - resourceUsageUtilFactory = mock(ResourceUsageUtilFactory.class); - cpuUsageUtil = mock(CpuUsageUtil.class); - memoryUsageUtil = mock(MemoryUsageUtil.class); - when(resourceUsageUtilFactory.getInstanceForResourceType(ResourceType.CPU)).thenReturn(cpuUsageUtil); - when(resourceUsageUtilFactory.getInstanceForResourceType(ResourceType.MEMORY)).thenReturn(memoryUsageUtil); clock = new TestClock(); + when(workloadManagementSettings.getNodeLevelCpuCancellationThreshold()).thenReturn(0.9); + when(workloadManagementSettings.getNodeLevelMemoryCancellationThreshold()).thenReturn(0.9); taskCancellation = new TestTaskCancellerImpl( workloadManagementSettings, new LongestTaskRunningFirstSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, - () -> false, - resourceUsageUtilFactory + () -> false ); } @@ -115,9 +103,6 @@ public void testGetCancellableTasksFrom_setupAppropriateCancellationReasonAndSco ); clock.fastForwardBy(1000); - when(memoryUsageUtil.getExcessUsage(queryGroup1, memoryUsage, 
workloadManagementSettings)).thenReturn(0.0); - when(cpuUsageUtil.getExcessUsage(queryGroup1, cpuUsage, workloadManagementSettings)).thenReturn(0.01); - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); when(mockView.getResourceUsageData()).thenReturn(Map.of(resourceType, cpuUsage, ResourceType.MEMORY, memoryUsage)); queryGroupLevelViews.put(queryGroupId1, mockView); @@ -149,9 +134,6 @@ public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThreshold() { 1L ); - when(memoryUsageUtil.getExcessUsage(queryGroup1, memoryUsage, workloadManagementSettings)).thenReturn(0.0); - when(cpuUsageUtil.getExcessUsage(queryGroup1, cpuUsage, workloadManagementSettings)).thenReturn(0.01); - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); when(mockView.getResourceUsageData()).thenReturn(Map.of(resourceType, cpuUsage, ResourceType.MEMORY, memoryUsage)); queryGroupLevelViews.put(queryGroupId1, mockView); @@ -175,11 +157,6 @@ public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThresholdForMem 1L ); - when(memoryUsageUtil.getExcessUsage(queryGroup1, memoryUsage, workloadManagementSettings)).thenReturn(0.01); - when(memoryUsageUtil.isBreachingThresholdFor(queryGroup1, memoryUsage, workloadManagementSettings)).thenReturn(true); - when(cpuUsageUtil.isBreachingThresholdFor(queryGroup1, memoryUsage, workloadManagementSettings)).thenReturn(false); - when(cpuUsageUtil.getExcessUsage(queryGroup1, cpuUsage, workloadManagementSettings)).thenReturn(0.0); - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); when(mockView.getResourceUsageData()).thenReturn(Map.of(ResourceType.CPU, cpuUsage, resourceType, memoryUsage)); @@ -194,7 +171,7 @@ public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThresholdForMem public void testGetCancellableTasksFrom_returnsNoTasksWhenNotBreachingThreshold() { ResourceType resourceType = ResourceType.CPU; - double cpuUsage = 0.91; + double cpuUsage = 0.81; double 
memoryUsage = 0.0; Double threshold = 0.9; QueryGroup queryGroup1 = new QueryGroup( @@ -203,8 +180,6 @@ public void testGetCancellableTasksFrom_returnsNoTasksWhenNotBreachingThreshold( new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold)), 1L ); - when(memoryUsageUtil.getExcessUsage(queryGroup1, memoryUsage, workloadManagementSettings)).thenReturn(0.0); - when(cpuUsageUtil.getExcessUsage(queryGroup1, cpuUsage, workloadManagementSettings)).thenReturn(0.0); QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); when(mockView.getResourceUsageData()).thenReturn(Map.of(ResourceType.CPU, cpuUsage, ResourceType.MEMORY, memoryUsage)); @@ -237,8 +212,7 @@ public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, - () -> false, - resourceUsageUtilFactory + () -> false ); List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.SOFT); @@ -259,10 +233,6 @@ public void testCancelTasks_cancelsGivenTasks() { 1L ); - when(memoryUsageUtil.getExcessUsage(queryGroup1, memoryUsage, workloadManagementSettings)).thenReturn(0.0); - when(cpuUsageUtil.getExcessUsage(queryGroup1, cpuUsage, workloadManagementSettings)).thenReturn(0.01); - when(cpuUsageUtil.isBreachingThresholdFor(queryGroup1, cpuUsage, workloadManagementSettings)).thenReturn(true); - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); when(mockView.getResourceUsageData()).thenReturn(Map.of(ResourceType.CPU, cpuUsage, ResourceType.MEMORY, memoryUsage)); @@ -275,8 +245,7 @@ public void testCancelTasks_cancelsGivenTasks() { queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, - () -> false, - resourceUsageUtilFactory + () -> false ); List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); @@ -291,9 +260,9 @@ public void testCancelTasks_cancelsGivenTasks() { public void 
testCancelTasks_cancelsTasksFromDeletedQueryGroups() { ResourceType resourceType = ResourceType.CPU; - double activeQueryGroupCpuUsage = 0.0; + double activeQueryGroupCpuUsage = 0.01; double activeQueryGroupMemoryUsage = 0.0; - double deletedQueryGroupCpuUsage = 0.011; + double deletedQueryGroupCpuUsage = 0.01; double deletedQueryGroupMemoryUsage = 0.0; Double threshold = 0.01; @@ -311,16 +280,6 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { 1L ); - when(memoryUsageUtil.getExcessUsage(deletedQueryGroup, deletedQueryGroupMemoryUsage, workloadManagementSettings)).thenReturn(0.0); - when(cpuUsageUtil.getExcessUsage(deletedQueryGroup, deletedQueryGroupCpuUsage, workloadManagementSettings)).thenReturn(0.01); - when(cpuUsageUtil.isBreachingThresholdFor(deletedQueryGroup, deletedQueryGroupCpuUsage, workloadManagementSettings)).thenReturn( - true - ); - - when(memoryUsageUtil.getExcessUsage(activeQueryGroup, activeQueryGroupMemoryUsage, workloadManagementSettings)).thenReturn(0.0); - when(cpuUsageUtil.getExcessUsage(activeQueryGroup, activeQueryGroupCpuUsage, workloadManagementSettings)).thenReturn(0.01); - when(cpuUsageUtil.isBreachingThresholdFor(activeQueryGroup, activeQueryGroupCpuUsage, workloadManagementSettings)).thenReturn(true); - QueryGroupLevelResourceUsageView mockView1 = createResourceUsageViewMock(); QueryGroupLevelResourceUsageView mockView2 = createResourceUsageViewMock( resourceType, @@ -346,8 +305,7 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, - () -> true, - resourceUsageUtilFactory + () -> true ); List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); @@ -393,16 +351,6 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN 1L ); - when(memoryUsageUtil.getExcessUsage(deletedQueryGroup, deletedQueryGroupMemoryUsage, workloadManagementSettings)).thenReturn(0.0); - 
when(cpuUsageUtil.getExcessUsage(deletedQueryGroup, deletedQueryGroupCpuUsage, workloadManagementSettings)).thenReturn(0.01); - when(cpuUsageUtil.isBreachingThresholdFor(deletedQueryGroup, deletedQueryGroupCpuUsage, workloadManagementSettings)).thenReturn( - true - ); - - when(memoryUsageUtil.getExcessUsage(activeQueryGroup, activeQueryGroupMemoryUsage, workloadManagementSettings)).thenReturn(0.0); - when(cpuUsageUtil.getExcessUsage(activeQueryGroup, activeQueryGroupCpuUsage, workloadManagementSettings)).thenReturn(0.01); - when(cpuUsageUtil.isBreachingThresholdFor(activeQueryGroup, activeQueryGroupCpuUsage, workloadManagementSettings)).thenReturn(true); - QueryGroupLevelResourceUsageView mockView1 = createResourceUsageViewMock(); QueryGroupLevelResourceUsageView mockView2 = createResourceUsageViewMock( resourceType, @@ -428,8 +376,7 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, - () -> false, - resourceUsageUtilFactory + () -> false ); List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); @@ -474,14 +421,6 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { 1L ); - when(memoryUsageUtil.getExcessUsage(queryGroup1, memoryUsage1, workloadManagementSettings)).thenReturn(0.0); - when(cpuUsageUtil.getExcessUsage(queryGroup1, cpuUsage1, workloadManagementSettings)).thenReturn(0.01); - when(cpuUsageUtil.isBreachingThresholdFor(queryGroup1, cpuUsage1, workloadManagementSettings)).thenReturn(true); - - when(memoryUsageUtil.getExcessUsage(queryGroup2, memoryUsage2, workloadManagementSettings)).thenReturn(0.0); - when(cpuUsageUtil.getExcessUsage(queryGroup2, cpuUsage2, workloadManagementSettings)).thenReturn(0.01); - when(cpuUsageUtil.isBreachingThresholdFor(queryGroup2, cpuUsage2, workloadManagementSettings)).thenReturn(true); - QueryGroupLevelResourceUsageView mockView1 = createResourceUsageViewMock(); 
when(mockView1.getResourceUsageData()).thenReturn(Map.of(ResourceType.CPU, cpuUsage1, ResourceType.MEMORY, memoryUsage1)); queryGroupLevelViews.put(queryGroupId1, mockView1); @@ -497,8 +436,7 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, - () -> true, - resourceUsageUtilFactory + () -> true ); List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); @@ -520,7 +458,7 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { public void testGetAllCancellableTasks_ReturnsNoTasksWhenNotBreachingThresholds() { ResourceType resourceType = ResourceType.CPU; - double queryGroupCpuUsage = 0.11; + double queryGroupCpuUsage = 0.09; double queryGroupMemoryUsage = 0.0; Double threshold = 0.1; @@ -555,10 +493,6 @@ public void testGetAllCancellableTasks_ReturnsTasksWhenBreachingThresholds() { 1L ); - when(memoryUsageUtil.getExcessUsage(queryGroup1, memoryUsage, workloadManagementSettings)).thenReturn(0.0); - when(cpuUsageUtil.getExcessUsage(queryGroup1, cpuUsage, workloadManagementSettings)).thenReturn(0.01); - when(cpuUsageUtil.isBreachingThresholdFor(queryGroup1, cpuUsage, workloadManagementSettings)).thenReturn(true); - QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); when(mockView.getResourceUsageData()).thenReturn(Map.of(ResourceType.CPU, cpuUsage, ResourceType.MEMORY, memoryUsage)); queryGroupLevelViews.put(queryGroupId1, mockView); diff --git a/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java index 6e40500b4f1f6..b0083d7b9e1e5 100644 --- a/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java +++ b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java @@ -27,17 +27,8 @@ import static org.mockito.Mockito.when; public class 
ResourceUsageCalculatorTests extends OpenSearchTestCase { - ResourceUsageCalculator sut; - - public void testFactoryMethods() { - ResourceUsageCalculatorFactory resourceUsageCalculatorFactory = ResourceUsageCalculatorFactory.getInstance(); - assertTrue(resourceUsageCalculatorFactory.getInstanceForResourceType(ResourceType.CPU) instanceof CpuUsageCalculator); - assertTrue(resourceUsageCalculatorFactory.getInstanceForResourceType(ResourceType.MEMORY) instanceof MemoryUsageCalculator); - assertThrows(IllegalArgumentException.class, () -> resourceUsageCalculatorFactory.getInstanceForResourceType(null)); - } public void testQueryGroupCpuUsage() { - sut = CpuUsageCalculator.getInstance(); TestClock clock = new TestClock(); long fastForwardTime = PROCESSOR_COUNT * 200L; clock.fastForwardBy(fastForwardTime); @@ -48,23 +39,22 @@ public void testQueryGroupCpuUsage() { double expectedQueryGroupCpuUsage = 1.0 / PROCESSOR_COUNT; QueryGroupTask mockTask = createMockTaskWithResourceStats(QueryGroupTask.class, fastForwardTime, 200, 0, 123); - double actualUsage = sut.calculateResourceUsage(List.of(mockTask), clock::getTime); + double actualUsage = ResourceType.CPU.calculateQueryGroupUsage(List.of(mockTask), clock::getTime); assertEquals(expectedQueryGroupCpuUsage, actualUsage, MIN_VALUE); - double taskResourceUsage = sut.calculateTaskResourceUsage(mockTask, clock::getTime); + double taskResourceUsage = ResourceType.CPU.calculateTaskUsage(mockTask, clock::getTime); assertEquals(1.0, taskResourceUsage, MIN_VALUE); } public void testQueryGroupMemoryUsage() { - sut = MemoryUsageCalculator.getInstance(); TestClock clock = new TestClock(); QueryGroupTask mockTask = createMockTaskWithResourceStats(QueryGroupTask.class, 100, 200, 0, 123); - double actualMemoryUsage = sut.calculateResourceUsage(List.of(mockTask), clock::getTime); + double actualMemoryUsage = ResourceType.MEMORY.calculateQueryGroupUsage(List.of(mockTask), clock::getTime); double expectedMemoryUsage = 200.0 / 
HEAP_SIZE_BYTES; assertEquals(expectedMemoryUsage, actualMemoryUsage, MIN_VALUE); - assertEquals(200.0 / HEAP_SIZE_BYTES, sut.calculateTaskResourceUsage(mockTask, clock::getTime), MIN_VALUE); + assertEquals(200.0 / HEAP_SIZE_BYTES, ResourceType.MEMORY.calculateTaskUsage(mockTask, clock::getTime), MIN_VALUE); } public static T createMockTaskWithResourceStats( diff --git a/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTrackerServiceTests.java b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTrackerServiceTests.java index 4f5c5655a8bc7..d109f37243ee2 100644 --- a/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTrackerServiceTests.java +++ b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTrackerServiceTests.java @@ -45,9 +45,6 @@ public class ResourceUsageCalculatorTrackerServiceTests extends OpenSearchTestCa TaskResourceTrackingService mockTaskResourceTrackingService; QueryGroupResourceUsageTrackerService queryGroupResourceUsageTrackerService; WorkloadManagementSettings settings; - ResourceUsageCalculatorFactory resourceUsageCalculatorFactory; - CpuUsageCalculator cpuUsageCalculator; - MemoryUsageCalculator memoryUsageCalculator; public static class TestClock { long time; @@ -69,12 +66,7 @@ public void setup() { settings = mock(WorkloadManagementSettings.class); threadPool = new TestThreadPool(getTestName()); mockTaskResourceTrackingService = mock(TaskResourceTrackingService.class); - resourceUsageCalculatorFactory = ResourceUsageCalculatorFactory.getInstance(); - queryGroupResourceUsageTrackerService = new QueryGroupResourceUsageTrackerService( - mockTaskResourceTrackingService, - clock::getTime, - resourceUsageCalculatorFactory - ); + queryGroupResourceUsageTrackerService = new QueryGroupResourceUsageTrackerService(mockTaskResourceTrackingService, clock::getTime); } @After diff --git a/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageUtilTests.java 
b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageUtilTests.java deleted file mode 100644 index 943a15825ea63..0000000000000 --- a/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageUtilTests.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -package org.opensearch.wlm.tracker; - -import org.opensearch.cluster.metadata.QueryGroup; -import org.opensearch.test.OpenSearchTestCase; -import org.opensearch.wlm.MutableQueryGroupFragment; -import org.opensearch.wlm.ResourceType; -import org.opensearch.wlm.WorkloadManagementSettings; -import org.opensearch.wlm.tracker.ResourceUsageUtil.CpuUsageUtil; -import org.opensearch.wlm.tracker.ResourceUsageUtil.MemoryUsageUtil; - -import java.util.Map; - -import static org.opensearch.wlm.cancellation.TaskCanceller.MIN_VALUE; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - -public class ResourceUsageUtilTests extends OpenSearchTestCase { - ResourceUsageUtil sut; - ResourceUsageUtilFactory resourceUsageUtilFactory; - WorkloadManagementSettings settings; - - public void setUp() throws Exception { - super.setUp(); - resourceUsageUtilFactory = ResourceUsageUtilFactory.getInstance(); - settings = mock(WorkloadManagementSettings.class); - } - - public void testFactoryClass() { - assertTrue(resourceUsageUtilFactory.getInstanceForResourceType(ResourceType.CPU) instanceof CpuUsageUtil); - assertTrue(resourceUsageUtilFactory.getInstanceForResourceType(ResourceType.MEMORY) instanceof MemoryUsageUtil); - assertThrows(IllegalArgumentException.class, () -> resourceUsageUtilFactory.getInstanceForResourceType(null)); - } - - public void testCpuUsageUtil() { - sut = resourceUsageUtilFactory.getInstanceForResourceType(ResourceType.CPU); - QueryGroup queryGroup = new QueryGroup( - "testQG", - new 
MutableQueryGroupFragment(MutableQueryGroupFragment.ResiliencyMode.ENFORCED, Map.of(ResourceType.CPU, 0.50)) - ); - when(settings.getNodeLevelCpuCancellationThreshold()).thenReturn(0.9); - - assertExpectedValues(queryGroup); - } - - public void testMemoryUsageUtil() { - sut = resourceUsageUtilFactory.getInstanceForResourceType(ResourceType.MEMORY); - QueryGroup queryGroup = new QueryGroup( - "testQG", - new MutableQueryGroupFragment(MutableQueryGroupFragment.ResiliencyMode.ENFORCED, Map.of(ResourceType.MEMORY, 0.50)) - ); - when(settings.getNodeLevelMemoryCancellationThreshold()).thenReturn(0.9); - assertExpectedValues(queryGroup); - } - - private void assertExpectedValues(QueryGroup queryGroup) { - double normalisedThreshold = sut.getNormalisedThreshold(queryGroup, settings); - assertEquals(0.45, normalisedThreshold, MIN_VALUE); - - assertEquals(0.1, sut.getExcessUsage(queryGroup, 0.55, settings), MIN_VALUE); - assertTrue(sut.isBreachingThresholdFor(queryGroup, 0.55, settings)); - } -} From 8aede330f4feb9b5024d907d92c58ad6dbb019b3 Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Thu, 5 Sep 2024 21:21:11 -0700 Subject: [PATCH 23/47] change annotation version and task selection strategy Signed-off-by: Kaushal Kumar --- .../cluster/metadata/QueryGroup.java | 2 +- .../org/opensearch/wlm/QueryGroupTask.java | 2 +- .../wlm/WorkloadManagementSettings.java | 2 +- ...eConsumingTaskFirstSelectionStrategy.java} | 17 +++---- .../wlm/cancellation/TaskCanceller.java | 26 +++++------ ...umingTaskFirstSelectionStrategyTests.java} | 46 ++++++++++--------- .../wlm/cancellation/TaskCancellerTests.java | 16 +++---- 7 files changed, 58 insertions(+), 53 deletions(-) rename server/src/main/java/org/opensearch/wlm/cancellation/{LongestTaskRunningFirstSelectionStrategy.java => HighestResourceConsumingTaskFirstSelectionStrategy.java} (73%) rename server/src/test/java/org/opensearch/wlm/cancellation/{LongestTaskRunningFirstSelectionStrategyTests.java => 
HighestResourceConsumingTaskFirstSelectionStrategyTests.java} (67%) diff --git a/server/src/main/java/org/opensearch/cluster/metadata/QueryGroup.java b/server/src/main/java/org/opensearch/cluster/metadata/QueryGroup.java index 94afd3f12fa93..0eeafdc8f5eed 100644 --- a/server/src/main/java/org/opensearch/cluster/metadata/QueryGroup.java +++ b/server/src/main/java/org/opensearch/cluster/metadata/QueryGroup.java @@ -42,7 +42,7 @@ * "updated_at": 4513232415 * } */ -@PublicApi(since = "2.17.0") +@PublicApi(since = "2.18.0") public class QueryGroup extends AbstractDiffable implements ToXContentObject { public static final String _ID_STRING = "_id"; diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java b/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java index 1fb9a0b58c8de..6ffc2fe9e802e 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java @@ -25,7 +25,7 @@ /** * Base class to define QueryGroup tasks */ -@PublicApi(since = "2.17.0") +@PublicApi(since = "2.18.0") public class QueryGroupTask extends CancellableTask { private static final Logger logger = LogManager.getLogger(QueryGroupTask.class); diff --git a/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java b/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java index e9240f5517e85..b3577c1b3219d 100644 --- a/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java +++ b/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java @@ -16,7 +16,7 @@ /** * Main class to declare Workload Management related settings */ -@PublicApi(since = "2.17.0") +@PublicApi(since = "2.18.0") public class WorkloadManagementSettings { private static final Double DEFAULT_NODE_LEVEL_MEMORY_REJECTION_THRESHOLD = 0.8; private static final Double DEFAULT_NODE_LEVEL_MEMORY_CANCELLATION_THRESHOLD = 0.9; diff --git 
a/server/src/main/java/org/opensearch/wlm/cancellation/LongestTaskRunningFirstSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategy.java similarity index 73% rename from server/src/main/java/org/opensearch/wlm/cancellation/LongestTaskRunningFirstSelectionStrategy.java rename to server/src/main/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategy.java index f05149c935e3f..1708a6ae940a1 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/LongestTaskRunningFirstSelectionStrategy.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategy.java @@ -21,28 +21,29 @@ import static org.opensearch.wlm.cancellation.TaskCanceller.MIN_VALUE; /** - * Represents the longest running task first selection strategy. + * Represents the highest resource consuming task first selection strategy. */ -public class LongestTaskRunningFirstSelectionStrategy implements TaskSelectionStrategy { +public class HighestResourceConsumingTaskFirstSelectionStrategy implements TaskSelectionStrategy { private final Supplier nanoTimeSupplier; - public LongestTaskRunningFirstSelectionStrategy() { + public HighestResourceConsumingTaskFirstSelectionStrategy() { this(System::nanoTime); } - public LongestTaskRunningFirstSelectionStrategy(Supplier nanoTimeSupplier) { + public HighestResourceConsumingTaskFirstSelectionStrategy(Supplier nanoTimeSupplier) { this.nanoTimeSupplier = nanoTimeSupplier; } /** * Returns a comparator that defines the sorting condition for tasks. - * This is the default implementation since the longest running tasks are the likely to regress the performance. + * This is the default implementation since the most resource consuming tasks are the likely to regress the performance. 
+ * from resiliency point of view it makes sense to cancel them first * * @return The comparator */ - protected Comparator sortingCondition() { - return Comparator.comparingLong(QueryGroupTask::getStartTime); + private Comparator sortingCondition(ResourceType resourceType) { + return Comparator.comparingDouble(task -> resourceType.calculateTaskUsage(task, nanoTimeSupplier)); } /** @@ -63,7 +64,7 @@ public List selectTasksForCancellation(List task return Collections.emptyList(); } - List sortedTasks = tasks.stream().sorted(sortingCondition()).collect(Collectors.toList()); + List sortedTasks = tasks.stream().sorted(sortingCondition(resourceType).reversed()).collect(Collectors.toList()); List selectedTasks = new ArrayList<>(); double accumulated = 0; diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java b/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java index 3caa5c0cc062e..f0175bc884ab4 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java @@ -29,7 +29,7 @@ /** * Manages the cancellation of tasks enforced by QueryGroup thresholds on resource usage criteria. - * This class utilizes a strategy pattern through {@link LongestTaskRunningFirstSelectionStrategy} to identify tasks that exceed + * This class utilizes a strategy pattern through {@link HighestResourceConsumingTaskFirstSelectionStrategy} to identify tasks that exceed * predefined resource usage limits and are therefore eligible for cancellation. * *

The cancellation process is initiated by evaluating the resource usage of each QueryGroup against its @@ -40,24 +40,24 @@ * views, a set of active QueryGroups, and a task selection strategy. These components collectively facilitate the * identification and cancellation of tasks that threaten to breach QueryGroup resource limits.

* - * @see LongestTaskRunningFirstSelectionStrategy + * @see HighestResourceConsumingTaskFirstSelectionStrategy * @see QueryGroup * @see ResourceType */ public class TaskCanceller { public static final double MIN_VALUE = 1e-9; - protected final WorkloadManagementSettings workloadManagementSettings; - protected final TaskSelectionStrategy taskSelectionStrategy; + private final WorkloadManagementSettings workloadManagementSettings; + private final TaskSelectionStrategy taskSelectionStrategy; // a map of QueryGroupId to its corresponding QueryGroupLevelResourceUsageView object - protected final Map queryGroupLevelResourceUsageViews; - protected final Collection activeQueryGroups; - protected final Collection deletedQueryGroups; - protected BooleanSupplier isNodeInDuress; + private final Map queryGroupLevelResourceUsageViews; + private final Collection activeQueryGroups; + private final Collection deletedQueryGroups; + private BooleanSupplier isNodeInDuress; public TaskCanceller( WorkloadManagementSettings workloadManagementSettings, - LongestTaskRunningFirstSelectionStrategy taskSelectionStrategy, + HighestResourceConsumingTaskFirstSelectionStrategy taskSelectionStrategy, Map queryGroupLevelResourceUsageViews, Collection activeQueryGroups, Collection deletedQueryGroups, @@ -106,7 +106,7 @@ private void cancelTasksFromDeletedQueryGroups() { * * @return List of tasks that can be cancelled */ - protected List getAllCancellableTasks(ResiliencyMode resiliencyMode) { + List getAllCancellableTasks(ResiliencyMode resiliencyMode) { return getAllCancellableTasks(getQueryGroupsToCancelFrom(resiliencyMode)); } @@ -115,7 +115,7 @@ protected List getAllCancellableTasks(ResiliencyMode resilienc * * @return List of tasks that can be cancelled */ - protected List getAllCancellableTasks(Collection queryGroups) { + List getAllCancellableTasks(Collection queryGroups) { return queryGroups.stream().flatMap(queryGroup -> 
getCancellableTasksFrom(queryGroup).stream()).collect(Collectors.toList()); } @@ -163,7 +163,7 @@ private void cancelTasks(List cancellableTasks) { * @param queryGroup The QueryGroup from which to get cancellable tasks * @return List of tasks that can be cancelled */ - protected List getCancellableTasksFrom(QueryGroup queryGroup) { + List getCancellableTasksFrom(QueryGroup queryGroup) { return TRACKED_RESOURCES.stream() .filter(resourceType -> shouldCancelTasks(queryGroup, resourceType)) .flatMap(resourceType -> getTaskCancellations(queryGroup, resourceType).stream()) @@ -208,7 +208,7 @@ private TaskCancellation createTaskCancellation(CancellableTask task, String can return new TaskCancellation(task, List.of(new TaskCancellation.Reason(cancellationReason, 5)), List.of(this::callbackOnCancel)); } - protected List getTaskCancellationsForDeletedQueryGroup(QueryGroup queryGroup) { + List getTaskCancellationsForDeletedQueryGroup(QueryGroup queryGroup) { List tasks = queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getActiveTasks(); List taskCancellations = new ArrayList<>(); diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/LongestTaskRunningFirstSelectionStrategyTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategyTests.java similarity index 67% rename from server/src/test/java/org/opensearch/wlm/cancellation/LongestTaskRunningFirstSelectionStrategyTests.java rename to server/src/test/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategyTests.java index a93b1acf67fa5..b534efb7a2d20 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/LongestTaskRunningFirstSelectionStrategyTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategyTests.java @@ -22,39 +22,46 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.stream.IntStream; 
import static org.opensearch.wlm.cancellation.TaskCanceller.MIN_VALUE; import static org.opensearch.wlm.tracker.MemoryUsageCalculator.HEAP_SIZE_BYTES; -public class LongestTaskRunningFirstSelectionStrategyTests extends OpenSearchTestCase { +public class HighestResourceConsumingTaskFirstSelectionStrategyTests extends OpenSearchTestCase { private TestClock clock; public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsGreaterThanZero() { clock = new TestClock(); - LongestTaskRunningFirstSelectionStrategy testLongestTaskRunningFirstSelectionStrategy = - new LongestTaskRunningFirstSelectionStrategy(clock::getTime); - long thresholdInLong = 100L; - double reduceBy = 50.0 / HEAP_SIZE_BYTES; + HighestResourceConsumingTaskFirstSelectionStrategy testHighestResourceConsumingTaskFirstSelectionStrategy = + new HighestResourceConsumingTaskFirstSelectionStrategy(clock::getTime); + double reduceBy = 50000.0 / HEAP_SIZE_BYTES; ResourceType resourceType = ResourceType.MEMORY; - List tasks = getListOfTasks(thresholdInLong); - List selectedTasks = testLongestTaskRunningFirstSelectionStrategy.selectTasksForCancellation( + List tasks = getListOfTasks(100); + List selectedTasks = testHighestResourceConsumingTaskFirstSelectionStrategy.selectTasksForCancellation( tasks, reduceBy, resourceType ); assertFalse(selectedTasks.isEmpty()); + boolean sortedInDescendingResourceUsage = IntStream.range(0, selectedTasks.size() - 1) + .noneMatch( + index -> ResourceType.MEMORY.calculateTaskUsage(selectedTasks.get(index), null) < ResourceType.MEMORY.calculateTaskUsage( + selectedTasks.get(index + 1), + null + ) + ); + assertTrue(sortedInDescendingResourceUsage); assertTrue(tasksUsageMeetsThreshold(selectedTasks, reduceBy)); } public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsLesserThanZero() { - LongestTaskRunningFirstSelectionStrategy testLongestTaskRunningFirstSelectionStrategy = - new LongestTaskRunningFirstSelectionStrategy(); - long thresholdInLong = 
100L; + HighestResourceConsumingTaskFirstSelectionStrategy testHighestResourceConsumingTaskFirstSelectionStrategy = + new HighestResourceConsumingTaskFirstSelectionStrategy(); double reduceBy = -50.0 / HEAP_SIZE_BYTES; ResourceType resourceType = ResourceType.MEMORY; - List tasks = getListOfTasks(thresholdInLong); + List tasks = getListOfTasks(3); try { - testLongestTaskRunningFirstSelectionStrategy.selectTasksForCancellation(tasks, reduceBy, resourceType); + testHighestResourceConsumingTaskFirstSelectionStrategy.selectTasksForCancellation(tasks, reduceBy, resourceType); } catch (Exception e) { assertTrue(e instanceof IllegalArgumentException); assertEquals("limit has to be greater than zero", e.getMessage()); @@ -62,13 +69,12 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsLess } public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsEqualToZero() { - LongestTaskRunningFirstSelectionStrategy testLongestTaskRunningFirstSelectionStrategy = - new LongestTaskRunningFirstSelectionStrategy(); - long thresholdInLong = 100L; + HighestResourceConsumingTaskFirstSelectionStrategy testHighestResourceConsumingTaskFirstSelectionStrategy = + new HighestResourceConsumingTaskFirstSelectionStrategy(); double reduceBy = 0.0; ResourceType resourceType = ResourceType.MEMORY; - List tasks = getListOfTasks(thresholdInLong); - List selectedTasks = testLongestTaskRunningFirstSelectionStrategy.selectTasksForCancellation( + List tasks = getListOfTasks(50); + List selectedTasks = testHighestResourceConsumingTaskFirstSelectionStrategy.selectTasksForCancellation( tasks, reduceBy, resourceType @@ -87,10 +93,10 @@ private boolean tasksUsageMeetsThreshold(List selectedTasks, dou return false; } - private List getListOfTasks(long totalMemory) { + private List getListOfTasks(int numberOfTasks) { List tasks = new ArrayList<>(); - while (totalMemory > 0) { + while (tasks.size() < numberOfTasks) { long id = randomLong(); final QueryGroupTask task = 
getRandomSearchTask(id); long initial_memory = randomLongBetween(1, 100); @@ -101,8 +107,6 @@ private List getListOfTasks(long totalMemory) { long memory = initial_memory + randomLongBetween(1, 10000); - totalMemory -= memory - initial_memory; - ResourceUsageMetric[] taskResourceMetrics = new ResourceUsageMetric[] { new ResourceUsageMetric(ResourceStats.MEMORY, memory), }; task.updateThreadResourceStats(id, ResourceStatsType.WORKER_STATS, taskResourceMetrics); diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellerTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellerTests.java index 26fccc807aa5f..655d3aea9d5d7 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellerTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellerTests.java @@ -46,7 +46,7 @@ private static class TestTaskCancellerImpl extends TaskCanceller { public TestTaskCancellerImpl( WorkloadManagementSettings workloadManagementSettings, - LongestTaskRunningFirstSelectionStrategy longestTaskRunningFirstSelectionStrategy, + HighestResourceConsumingTaskFirstSelectionStrategy highestResourceConsumingTaskFirstSelectionStrategy, Map queryGroupLevelViews, Set activeQueryGroups, Set deletedQueryGroups, @@ -54,7 +54,7 @@ public TestTaskCancellerImpl( ) { super( workloadManagementSettings, - longestTaskRunningFirstSelectionStrategy, + highestResourceConsumingTaskFirstSelectionStrategy, queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, @@ -81,7 +81,7 @@ public void setup() { when(workloadManagementSettings.getNodeLevelMemoryCancellationThreshold()).thenReturn(0.9); taskCancellation = new TestTaskCancellerImpl( workloadManagementSettings, - new LongestTaskRunningFirstSelectionStrategy(), + new HighestResourceConsumingTaskFirstSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, @@ -208,7 +208,7 @@ public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { 
TestTaskCancellerImpl taskCancellation = new TestTaskCancellerImpl( workloadManagementSettings, - new LongestTaskRunningFirstSelectionStrategy(), + new HighestResourceConsumingTaskFirstSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, @@ -241,7 +241,7 @@ public void testCancelTasks_cancelsGivenTasks() { TestTaskCancellerImpl taskCancellation = new TestTaskCancellerImpl( workloadManagementSettings, - new LongestTaskRunningFirstSelectionStrategy(), + new HighestResourceConsumingTaskFirstSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, @@ -301,7 +301,7 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { TestTaskCancellerImpl taskCancellation = new TestTaskCancellerImpl( workloadManagementSettings, - new LongestTaskRunningFirstSelectionStrategy(), + new HighestResourceConsumingTaskFirstSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, @@ -372,7 +372,7 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN TestTaskCancellerImpl taskCancellation = new TestTaskCancellerImpl( workloadManagementSettings, - new LongestTaskRunningFirstSelectionStrategy(), + new HighestResourceConsumingTaskFirstSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, @@ -432,7 +432,7 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { TestTaskCancellerImpl taskCancellation = new TestTaskCancellerImpl( workloadManagementSettings, - new LongestTaskRunningFirstSelectionStrategy(), + new HighestResourceConsumingTaskFirstSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, From 623f6f8999df78dcca7d84d85b59574ce194699b Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Fri, 6 Sep 2024 12:39:03 -0700 Subject: [PATCH 24/47] rename a util class Signed-off-by: Kaushal Kumar --- .../java/org/opensearch/wlm/ResourceType.java | 16 ++++++++-------- ...sourceUsageUtil.java => 
QueryGroupUsage.java} | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) rename server/src/main/java/org/opensearch/wlm/tracker/{ResourceUsageUtil.java => QueryGroupUsage.java} (97%) diff --git a/server/src/main/java/org/opensearch/wlm/ResourceType.java b/server/src/main/java/org/opensearch/wlm/ResourceType.java index b2a75a60e23b5..fa00b917ee259 100644 --- a/server/src/main/java/org/opensearch/wlm/ResourceType.java +++ b/server/src/main/java/org/opensearch/wlm/ResourceType.java @@ -13,8 +13,8 @@ import org.opensearch.core.common.io.stream.StreamOutput; import org.opensearch.wlm.tracker.CpuUsageCalculator; import org.opensearch.wlm.tracker.MemoryUsageCalculator; +import org.opensearch.wlm.tracker.QueryGroupUsage; import org.opensearch.wlm.tracker.ResourceUsageCalculator; -import org.opensearch.wlm.tracker.ResourceUsageUtil; import java.io.IOException; import java.util.List; @@ -27,13 +27,13 @@ */ @PublicApi(since = "2.17.0") public enum ResourceType { - CPU("cpu", true, CpuUsageCalculator.INSTANCE, new ResourceUsageUtil() { + CPU("cpu", true, CpuUsageCalculator.INSTANCE, new QueryGroupUsage() { @Override protected double getNormalisedThreshold(QueryGroup queryGroup) { return queryGroup.getResourceLimits().get(ResourceType.CPU) * getSettings().getNodeLevelCpuCancellationThreshold(); } }), - MEMORY("memory", true, MemoryUsageCalculator.INSTANCE, new ResourceUsageUtil() { + MEMORY("memory", true, MemoryUsageCalculator.INSTANCE, new QueryGroupUsage() { @Override protected double getNormalisedThreshold(QueryGroup queryGroup) { return queryGroup.getResourceLimits().get(ResourceType.MEMORY) * getSettings().getNodeLevelMemoryCancellationThreshold(); @@ -43,14 +43,14 @@ protected double getNormalisedThreshold(QueryGroup queryGroup) { private final String name; private final boolean statsEnabled; private final ResourceUsageCalculator resourceUsageCalculator; - private final ResourceUsageUtil resourceUsageUtil; + private final QueryGroupUsage queryGroupUsage; private 
static List sortedValues = List.of(CPU, MEMORY); - ResourceType(String name, boolean statsEnabled, ResourceUsageCalculator resourceUsageCalculator, ResourceUsageUtil resourceUsageUtil) { + ResourceType(String name, boolean statsEnabled, ResourceUsageCalculator resourceUsageCalculator, QueryGroupUsage queryGroupUsage) { this.name = name; this.statsEnabled = statsEnabled; this.resourceUsageCalculator = resourceUsageCalculator; - this.resourceUsageUtil = resourceUsageUtil; + this.queryGroupUsage = queryGroupUsage; } /** @@ -92,11 +92,11 @@ public boolean isBreachingThreshold(QueryGroup queryGroup, double currentUsage) } public double getExcessUsage(QueryGroup queryGroup, double currentUsage) { - return resourceUsageUtil.getExcessUsage(queryGroup, currentUsage); + return queryGroupUsage.getExcessUsage(queryGroup, currentUsage); } public void setWorkloadManagementSettings(WorkloadManagementSettings settings) { - resourceUsageUtil.setSettings(settings); + queryGroupUsage.setSettings(settings); } public static List getSortedValues() { diff --git a/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtil.java b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupUsage.java similarity index 97% rename from server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtil.java rename to server/src/main/java/org/opensearch/wlm/tracker/QueryGroupUsage.java index dc23914ffa6bb..029405d431875 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageUtil.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupUsage.java @@ -14,7 +14,7 @@ /** * Utility class to provide utility methods at query group level */ -public abstract class ResourceUsageUtil { +public abstract class QueryGroupUsage { private WorkloadManagementSettings settings; public WorkloadManagementSettings getSettings() { From 9e2e3eac445d06f55e429ddda6774a284103c1df Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Fri, 6 Sep 2024 13:40:00 -0700 Subject: [PATCH 25/47] 
remove wrappers from resource type Signed-off-by: Kaushal Kumar --- .../java/org/opensearch/wlm/ResourceType.java | 33 ++++++------------- ...ceConsumingTaskFirstSelectionStrategy.java | 4 +-- .../wlm/cancellation/TaskCanceller.java | 8 +++-- ...QueryGroupResourceUsageTrackerService.java | 2 +- ...pUsage.java => QueryGroupUsageHelper.java} | 4 ++- .../wlm/tracker/ResourceUsageCalculator.java | 2 ++ ...sumingTaskFirstSelectionStrategyTests.java | 4 +-- .../tracker/ResourceUsageCalculatorTests.java | 8 ++--- 8 files changed, 29 insertions(+), 36 deletions(-) rename server/src/main/java/org/opensearch/wlm/tracker/{QueryGroupUsage.java => QueryGroupUsageHelper.java} (93%) diff --git a/server/src/main/java/org/opensearch/wlm/ResourceType.java b/server/src/main/java/org/opensearch/wlm/ResourceType.java index fa00b917ee259..856005f866bc7 100644 --- a/server/src/main/java/org/opensearch/wlm/ResourceType.java +++ b/server/src/main/java/org/opensearch/wlm/ResourceType.java @@ -13,12 +13,11 @@ import org.opensearch.core.common.io.stream.StreamOutput; import org.opensearch.wlm.tracker.CpuUsageCalculator; import org.opensearch.wlm.tracker.MemoryUsageCalculator; -import org.opensearch.wlm.tracker.QueryGroupUsage; +import org.opensearch.wlm.tracker.QueryGroupUsageHelper; import org.opensearch.wlm.tracker.ResourceUsageCalculator; import java.io.IOException; import java.util.List; -import java.util.function.Supplier; /** * Enum to hold the resource type @@ -27,13 +26,13 @@ */ @PublicApi(since = "2.17.0") public enum ResourceType { - CPU("cpu", true, CpuUsageCalculator.INSTANCE, new QueryGroupUsage() { + CPU("cpu", true, CpuUsageCalculator.INSTANCE, new QueryGroupUsageHelper() { @Override protected double getNormalisedThreshold(QueryGroup queryGroup) { return queryGroup.getResourceLimits().get(ResourceType.CPU) * getSettings().getNodeLevelCpuCancellationThreshold(); } }), - MEMORY("memory", true, MemoryUsageCalculator.INSTANCE, new QueryGroupUsage() { + MEMORY("memory", true, 
MemoryUsageCalculator.INSTANCE, new QueryGroupUsageHelper() { @Override protected double getNormalisedThreshold(QueryGroup queryGroup) { return queryGroup.getResourceLimits().get(ResourceType.MEMORY) * getSettings().getNodeLevelMemoryCancellationThreshold(); @@ -43,14 +42,14 @@ protected double getNormalisedThreshold(QueryGroup queryGroup) { private final String name; private final boolean statsEnabled; private final ResourceUsageCalculator resourceUsageCalculator; - private final QueryGroupUsage queryGroupUsage; + private final QueryGroupUsageHelper queryGroupUsageHelper; private static List sortedValues = List.of(CPU, MEMORY); - ResourceType(String name, boolean statsEnabled, ResourceUsageCalculator resourceUsageCalculator, QueryGroupUsage queryGroupUsage) { + ResourceType(String name, boolean statsEnabled, ResourceUsageCalculator resourceUsageCalculator, QueryGroupUsageHelper queryGroupUsageHelper) { this.name = name; this.statsEnabled = statsEnabled; this.resourceUsageCalculator = resourceUsageCalculator; - this.queryGroupUsage = queryGroupUsage; + this.queryGroupUsageHelper = queryGroupUsageHelper; } /** @@ -79,24 +78,12 @@ public boolean hasStatsEnabled() { return statsEnabled; } - public double calculateQueryGroupUsage(List tasks, Supplier nanoTimeSupplier) { - return resourceUsageCalculator.calculateResourceUsage(tasks, nanoTimeSupplier); + public ResourceUsageCalculator getResourceUsageCalculator() { + return resourceUsageCalculator; } - public double calculateTaskUsage(QueryGroupTask task, Supplier nanoTimeSupplier) { - return resourceUsageCalculator.calculateTaskResourceUsage(task, nanoTimeSupplier); - } - - public boolean isBreachingThreshold(QueryGroup queryGroup, double currentUsage) { - return getExcessUsage(queryGroup, currentUsage) > 0; - } - - public double getExcessUsage(QueryGroup queryGroup, double currentUsage) { - return queryGroupUsage.getExcessUsage(queryGroup, currentUsage); - } - - public void 
setWorkloadManagementSettings(WorkloadManagementSettings settings) { - queryGroupUsage.setSettings(settings); + public QueryGroupUsageHelper getQueryGroupUsage() { + return queryGroupUsageHelper; } public static List getSortedValues() { diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategy.java index 1708a6ae940a1..ec59b1645dd62 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategy.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategy.java @@ -43,7 +43,7 @@ public HighestResourceConsumingTaskFirstSelectionStrategy(Supplier nanoTim * @return The comparator */ private Comparator sortingCondition(ResourceType resourceType) { - return Comparator.comparingDouble(task -> resourceType.calculateTaskUsage(task, nanoTimeSupplier)); + return Comparator.comparingDouble(task -> resourceType.getResourceUsageCalculator().calculateTaskResourceUsage(task, nanoTimeSupplier)); } /** @@ -70,7 +70,7 @@ public List selectTasksForCancellation(List task double accumulated = 0; for (QueryGroupTask task : sortedTasks) { selectedTasks.add(task); - accumulated += resourceType.calculateTaskUsage(task, nanoTimeSupplier); + accumulated += resourceType.getResourceUsageCalculator().calculateTaskResourceUsage(task, nanoTimeSupplier); if ((accumulated - limit) > MIN_VALUE) { break; } diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java b/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java index f0175bc884ab4..60656fcdb0828 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java @@ -16,6 +16,7 @@ import org.opensearch.wlm.QueryGroupTask; import 
org.opensearch.wlm.ResourceType; import org.opensearch.wlm.WorkloadManagementSettings; +import org.opensearch.wlm.tracker.QueryGroupUsageHelper; import java.util.ArrayList; import java.util.Collection; @@ -69,7 +70,7 @@ public TaskCanceller( this.activeQueryGroups = activeQueryGroups; this.deletedQueryGroups = deletedQueryGroups; this.isNodeInDuress = isNodeInDuress; - TRACKED_RESOURCES.forEach(resourceType -> resourceType.setWorkloadManagementSettings(workloadManagementSettings)); + TRACKED_RESOURCES.forEach(resourceType -> resourceType.getQueryGroupUsage().setSettings(workloadManagementSettings)); } /** @@ -137,7 +138,7 @@ private List getQueryGroupsToCancelFrom(ResiliencyMode resiliencyMod for (ResourceType resourceType : TRACKED_RESOURCES) { if (queryGroup.getResourceLimits().containsKey(resourceType)) { final double currentUsage = queryGroupResourcesUsage.get(resourceType); - if (resourceType.isBreachingThreshold(queryGroup, currentUsage)) { + if (resourceType.getQueryGroupUsage().isBreachingThresholdFor(queryGroup, currentUsage)) { queryGroupsToCancelFrom.add(queryGroup); break; } @@ -230,7 +231,8 @@ private double getExcessUsage(QueryGroup queryGroup, ResourceType resourceType) final QueryGroupLevelResourceUsageView queryGroupResourceUsageView = queryGroupLevelResourceUsageViews.get(queryGroup.get_id()); final double currentUsage = queryGroupResourceUsageView.getResourceUsageData().get(resourceType); - return resourceType.getExcessUsage(queryGroup, currentUsage); + QueryGroupUsageHelper queryGroupUsageHelper = resourceType.getQueryGroupUsage(); + return queryGroupUsageHelper.getExcessUsage(queryGroup, currentUsage); } private void callbackOnCancel() { diff --git a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java index 0e7fab599dfcb..5d7ee90ac562b 100644 --- 
a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java @@ -52,7 +52,7 @@ public Map constructQueryGroupLevelUsa // Compute the QueryGroup resource usage final Map resourceUsage = new HashMap<>(); for (ResourceType resourceType : TRACKED_RESOURCES) { - double usage = resourceType.calculateQueryGroupUsage(queryGroupEntry.getValue(), nanoTimeSupplier); + double usage = resourceType.getResourceUsageCalculator().calculateResourceUsage(queryGroupEntry.getValue(), nanoTimeSupplier); resourceUsage.put(resourceType, usage); } diff --git a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupUsage.java b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupUsageHelper.java similarity index 93% rename from server/src/main/java/org/opensearch/wlm/tracker/QueryGroupUsage.java rename to server/src/main/java/org/opensearch/wlm/tracker/QueryGroupUsageHelper.java index 029405d431875..0053c21e3b78b 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupUsage.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupUsageHelper.java @@ -9,12 +9,14 @@ package org.opensearch.wlm.tracker; import org.opensearch.cluster.metadata.QueryGroup; +import org.opensearch.common.annotation.PublicApi; import org.opensearch.wlm.WorkloadManagementSettings; /** * Utility class to provide utility methods at query group level */ -public abstract class QueryGroupUsage { +@PublicApi(since = "2.18.0") +public abstract class QueryGroupUsageHelper { private WorkloadManagementSettings settings; public WorkloadManagementSettings getSettings() { diff --git a/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculator.java b/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculator.java index a5777b36e87e4..2c30687b3d259 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculator.java +++ 
b/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculator.java @@ -8,6 +8,7 @@ package org.opensearch.wlm.tracker; +import org.opensearch.common.annotation.PublicApi; import org.opensearch.wlm.QueryGroupTask; import java.util.List; @@ -16,6 +17,7 @@ /** * This class is used to track query group level resource usage */ +@PublicApi(since = "2.18.0") public interface ResourceUsageCalculator { /** * calculates the current resource usage for the query group diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategyTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategyTests.java index b534efb7a2d20..135613e2ef65a 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategyTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategyTests.java @@ -45,7 +45,7 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsGrea assertFalse(selectedTasks.isEmpty()); boolean sortedInDescendingResourceUsage = IntStream.range(0, selectedTasks.size() - 1) .noneMatch( - index -> ResourceType.MEMORY.calculateTaskUsage(selectedTasks.get(index), null) < ResourceType.MEMORY.calculateTaskUsage( + index -> ResourceType.MEMORY.getResourceUsageCalculator().calculateTaskResourceUsage(selectedTasks.get(index), null) < ResourceType.MEMORY.getResourceUsageCalculator().calculateTaskResourceUsage( selectedTasks.get(index + 1), null ) @@ -85,7 +85,7 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsEqua private boolean tasksUsageMeetsThreshold(List selectedTasks, double threshold) { double memory = 0; for (QueryGroupTask task : selectedTasks) { - memory += ResourceType.MEMORY.calculateTaskUsage(task, clock::getTime); + memory += 
ResourceType.MEMORY.getResourceUsageCalculator().calculateTaskResourceUsage(task, clock::getTime); if ((memory - threshold) > MIN_VALUE) { return true; } diff --git a/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java index b0083d7b9e1e5..5235987e5d2ac 100644 --- a/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java +++ b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java @@ -39,10 +39,10 @@ public void testQueryGroupCpuUsage() { double expectedQueryGroupCpuUsage = 1.0 / PROCESSOR_COUNT; QueryGroupTask mockTask = createMockTaskWithResourceStats(QueryGroupTask.class, fastForwardTime, 200, 0, 123); - double actualUsage = ResourceType.CPU.calculateQueryGroupUsage(List.of(mockTask), clock::getTime); + double actualUsage = ResourceType.CPU.getResourceUsageCalculator().calculateResourceUsage(List.of(mockTask), clock::getTime); assertEquals(expectedQueryGroupCpuUsage, actualUsage, MIN_VALUE); - double taskResourceUsage = ResourceType.CPU.calculateTaskUsage(mockTask, clock::getTime); + double taskResourceUsage = ResourceType.CPU.getResourceUsageCalculator().calculateTaskResourceUsage(mockTask, clock::getTime); assertEquals(1.0, taskResourceUsage, MIN_VALUE); } @@ -50,11 +50,11 @@ public void testQueryGroupMemoryUsage() { TestClock clock = new TestClock(); QueryGroupTask mockTask = createMockTaskWithResourceStats(QueryGroupTask.class, 100, 200, 0, 123); - double actualMemoryUsage = ResourceType.MEMORY.calculateQueryGroupUsage(List.of(mockTask), clock::getTime); + double actualMemoryUsage = ResourceType.MEMORY.getResourceUsageCalculator().calculateResourceUsage(List.of(mockTask), clock::getTime); double expectedMemoryUsage = 200.0 / HEAP_SIZE_BYTES; assertEquals(expectedMemoryUsage, actualMemoryUsage, MIN_VALUE); - assertEquals(200.0 / HEAP_SIZE_BYTES, ResourceType.MEMORY.calculateTaskUsage(mockTask, 
clock::getTime), MIN_VALUE); + assertEquals(200.0 / HEAP_SIZE_BYTES, ResourceType.MEMORY.getResourceUsageCalculator().calculateTaskResourceUsage(mockTask, clock::getTime), MIN_VALUE); } public static T createMockTaskWithResourceStats( From 34184efd2e36e45a15c4e7e21d8d30b2ee8b367d Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Fri, 6 Sep 2024 13:41:44 -0700 Subject: [PATCH 26/47] apply spotless Signed-off-by: Kaushal Kumar --- .../src/main/java/org/opensearch/wlm/ResourceType.java | 7 ++++++- ...ghestResourceConsumingTaskFirstSelectionStrategy.java | 4 +++- .../tracker/QueryGroupResourceUsageTrackerService.java | 3 ++- ...ResourceConsumingTaskFirstSelectionStrategyTests.java | 7 +++---- .../wlm/tracker/ResourceUsageCalculatorTests.java | 9 +++++++-- 5 files changed, 21 insertions(+), 9 deletions(-) diff --git a/server/src/main/java/org/opensearch/wlm/ResourceType.java b/server/src/main/java/org/opensearch/wlm/ResourceType.java index 856005f866bc7..b8e9ba8d02429 100644 --- a/server/src/main/java/org/opensearch/wlm/ResourceType.java +++ b/server/src/main/java/org/opensearch/wlm/ResourceType.java @@ -45,7 +45,12 @@ protected double getNormalisedThreshold(QueryGroup queryGroup) { private final QueryGroupUsageHelper queryGroupUsageHelper; private static List sortedValues = List.of(CPU, MEMORY); - ResourceType(String name, boolean statsEnabled, ResourceUsageCalculator resourceUsageCalculator, QueryGroupUsageHelper queryGroupUsageHelper) { + ResourceType( + String name, + boolean statsEnabled, + ResourceUsageCalculator resourceUsageCalculator, + QueryGroupUsageHelper queryGroupUsageHelper + ) { this.name = name; this.statsEnabled = statsEnabled; this.resourceUsageCalculator = resourceUsageCalculator; diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategy.java index ec59b1645dd62..c32272f1b8492 
100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategy.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategy.java @@ -43,7 +43,9 @@ public HighestResourceConsumingTaskFirstSelectionStrategy(Supplier nanoTim * @return The comparator */ private Comparator sortingCondition(ResourceType resourceType) { - return Comparator.comparingDouble(task -> resourceType.getResourceUsageCalculator().calculateTaskResourceUsage(task, nanoTimeSupplier)); + return Comparator.comparingDouble( + task -> resourceType.getResourceUsageCalculator().calculateTaskResourceUsage(task, nanoTimeSupplier) + ); } /** diff --git a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java index 5d7ee90ac562b..c7d9ff00929f9 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java @@ -52,7 +52,8 @@ public Map constructQueryGroupLevelUsa // Compute the QueryGroup resource usage final Map resourceUsage = new HashMap<>(); for (ResourceType resourceType : TRACKED_RESOURCES) { - double usage = resourceType.getResourceUsageCalculator().calculateResourceUsage(queryGroupEntry.getValue(), nanoTimeSupplier); + double usage = resourceType.getResourceUsageCalculator() + .calculateResourceUsage(queryGroupEntry.getValue(), nanoTimeSupplier); resourceUsage.put(resourceType, usage); } diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategyTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategyTests.java index 135613e2ef65a..1582a45897db3 100644 --- 
a/server/src/test/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategyTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategyTests.java @@ -45,10 +45,9 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsGrea assertFalse(selectedTasks.isEmpty()); boolean sortedInDescendingResourceUsage = IntStream.range(0, selectedTasks.size() - 1) .noneMatch( - index -> ResourceType.MEMORY.getResourceUsageCalculator().calculateTaskResourceUsage(selectedTasks.get(index), null) < ResourceType.MEMORY.getResourceUsageCalculator().calculateTaskResourceUsage( - selectedTasks.get(index + 1), - null - ) + index -> ResourceType.MEMORY.getResourceUsageCalculator() + .calculateTaskResourceUsage(selectedTasks.get(index), null) < ResourceType.MEMORY.getResourceUsageCalculator() + .calculateTaskResourceUsage(selectedTasks.get(index + 1), null) ); assertTrue(sortedInDescendingResourceUsage); assertTrue(tasksUsageMeetsThreshold(selectedTasks, reduceBy)); diff --git a/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java index 5235987e5d2ac..ebc9f366d369a 100644 --- a/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java +++ b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java @@ -50,11 +50,16 @@ public void testQueryGroupMemoryUsage() { TestClock clock = new TestClock(); QueryGroupTask mockTask = createMockTaskWithResourceStats(QueryGroupTask.class, 100, 200, 0, 123); - double actualMemoryUsage = ResourceType.MEMORY.getResourceUsageCalculator().calculateResourceUsage(List.of(mockTask), clock::getTime); + double actualMemoryUsage = ResourceType.MEMORY.getResourceUsageCalculator() + .calculateResourceUsage(List.of(mockTask), clock::getTime); double expectedMemoryUsage = 200.0 / HEAP_SIZE_BYTES; 
assertEquals(expectedMemoryUsage, actualMemoryUsage, MIN_VALUE); - assertEquals(200.0 / HEAP_SIZE_BYTES, ResourceType.MEMORY.getResourceUsageCalculator().calculateTaskResourceUsage(mockTask, clock::getTime), MIN_VALUE); + assertEquals( + 200.0 / HEAP_SIZE_BYTES, + ResourceType.MEMORY.getResourceUsageCalculator().calculateTaskResourceUsage(mockTask, clock::getTime), + MIN_VALUE + ); } public static T createMockTaskWithResourceStats( From 91893e7e2eab70009ebc67f0ff082278bcc2ae5b Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Mon, 9 Sep 2024 10:29:55 -0700 Subject: [PATCH 27/47] address comments Signed-off-by: Kaushal Kumar --- .../java/org/opensearch/wlm/ResourceType.java | 27 +++------ ...ceConsumingTaskFirstSelectionStrategy.java | 17 +----- .../wlm/cancellation/TaskCanceller.java | 21 ++++--- .../wlm/tracker/CpuUsageCalculator.java | 17 ++++-- .../wlm/tracker/MemoryUsageCalculator.java | 9 ++- ...QueryGroupResourceUsageTrackerService.java | 10 ++-- .../wlm/tracker/QueryGroupUsageHelper.java | 59 ------------------- .../wlm/tracker/ResourceUsageCalculator.java | 18 ++++-- ...sumingTaskFirstSelectionStrategyTests.java | 11 ++-- .../tracker/ResourceUsageCalculatorTests.java | 12 ++-- 10 files changed, 63 insertions(+), 138 deletions(-) delete mode 100644 server/src/main/java/org/opensearch/wlm/tracker/QueryGroupUsageHelper.java diff --git a/server/src/main/java/org/opensearch/wlm/ResourceType.java b/server/src/main/java/org/opensearch/wlm/ResourceType.java index b8e9ba8d02429..a560268a66853 100644 --- a/server/src/main/java/org/opensearch/wlm/ResourceType.java +++ b/server/src/main/java/org/opensearch/wlm/ResourceType.java @@ -8,16 +8,15 @@ package org.opensearch.wlm; -import org.opensearch.cluster.metadata.QueryGroup; import org.opensearch.common.annotation.PublicApi; import org.opensearch.core.common.io.stream.StreamOutput; import org.opensearch.wlm.tracker.CpuUsageCalculator; import org.opensearch.wlm.tracker.MemoryUsageCalculator; -import 
org.opensearch.wlm.tracker.QueryGroupUsageHelper; import org.opensearch.wlm.tracker.ResourceUsageCalculator; import java.io.IOException; import java.util.List; +import java.util.function.Function; /** * Enum to hold the resource type @@ -26,35 +25,25 @@ */ @PublicApi(since = "2.17.0") public enum ResourceType { - CPU("cpu", true, CpuUsageCalculator.INSTANCE, new QueryGroupUsageHelper() { - @Override - protected double getNormalisedThreshold(QueryGroup queryGroup) { - return queryGroup.getResourceLimits().get(ResourceType.CPU) * getSettings().getNodeLevelCpuCancellationThreshold(); - } - }), - MEMORY("memory", true, MemoryUsageCalculator.INSTANCE, new QueryGroupUsageHelper() { - @Override - protected double getNormalisedThreshold(QueryGroup queryGroup) { - return queryGroup.getResourceLimits().get(ResourceType.MEMORY) * getSettings().getNodeLevelMemoryCancellationThreshold(); - } - }); + CPU("cpu", true, CpuUsageCalculator.INSTANCE, WorkloadManagementSettings::getNodeLevelCpuCancellationThreshold), + MEMORY("memory", true, MemoryUsageCalculator.INSTANCE, WorkloadManagementSettings::getNodeLevelMemoryCancellationThreshold); private final String name; private final boolean statsEnabled; private final ResourceUsageCalculator resourceUsageCalculator; - private final QueryGroupUsageHelper queryGroupUsageHelper; + private final Function nodeLevelThresholdSupplier; private static List sortedValues = List.of(CPU, MEMORY); ResourceType( String name, boolean statsEnabled, ResourceUsageCalculator resourceUsageCalculator, - QueryGroupUsageHelper queryGroupUsageHelper + Function nodeLevelThresholdSupplier ) { this.name = name; this.statsEnabled = statsEnabled; this.resourceUsageCalculator = resourceUsageCalculator; - this.queryGroupUsageHelper = queryGroupUsageHelper; + this.nodeLevelThresholdSupplier = nodeLevelThresholdSupplier; } /** @@ -87,8 +76,8 @@ public ResourceUsageCalculator getResourceUsageCalculator() { return resourceUsageCalculator; } - public QueryGroupUsageHelper 
getQueryGroupUsage() { - return queryGroupUsageHelper; + public double getNodeLevelThreshold(WorkloadManagementSettings settings) { + return nodeLevelThresholdSupplier.apply(settings); } public static List getSortedValues() { diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategy.java index c32272f1b8492..3bceae9c8ffb1 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategy.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategy.java @@ -15,7 +15,6 @@ import java.util.Collections; import java.util.Comparator; import java.util.List; -import java.util.function.Supplier; import java.util.stream.Collectors; import static org.opensearch.wlm.cancellation.TaskCanceller.MIN_VALUE; @@ -25,15 +24,7 @@ */ public class HighestResourceConsumingTaskFirstSelectionStrategy implements TaskSelectionStrategy { - private final Supplier nanoTimeSupplier; - - public HighestResourceConsumingTaskFirstSelectionStrategy() { - this(System::nanoTime); - } - - public HighestResourceConsumingTaskFirstSelectionStrategy(Supplier nanoTimeSupplier) { - this.nanoTimeSupplier = nanoTimeSupplier; - } + public HighestResourceConsumingTaskFirstSelectionStrategy() {} /** * Returns a comparator that defines the sorting condition for tasks. 
@@ -43,9 +34,7 @@ public HighestResourceConsumingTaskFirstSelectionStrategy(Supplier nanoTim * @return The comparator */ private Comparator sortingCondition(ResourceType resourceType) { - return Comparator.comparingDouble( - task -> resourceType.getResourceUsageCalculator().calculateTaskResourceUsage(task, nanoTimeSupplier) - ); + return Comparator.comparingDouble(task -> resourceType.getResourceUsageCalculator().calculateTaskResourceUsage(task)); } /** @@ -72,7 +61,7 @@ public List selectTasksForCancellation(List task double accumulated = 0; for (QueryGroupTask task : sortedTasks) { selectedTasks.add(task); - accumulated += resourceType.getResourceUsageCalculator().calculateTaskResourceUsage(task, nanoTimeSupplier); + accumulated += resourceType.getResourceUsageCalculator().calculateTaskResourceUsage(task); if ((accumulated - limit) > MIN_VALUE) { break; } diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java b/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java index 60656fcdb0828..f86e4291eb181 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java @@ -16,7 +16,6 @@ import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; import org.opensearch.wlm.WorkloadManagementSettings; -import org.opensearch.wlm.tracker.QueryGroupUsageHelper; import java.util.ArrayList; import java.util.Collection; @@ -70,7 +69,6 @@ public TaskCanceller( this.activeQueryGroups = activeQueryGroups; this.deletedQueryGroups = deletedQueryGroups; this.isNodeInDuress = isNodeInDuress; - TRACKED_RESOURCES.forEach(resourceType -> resourceType.getQueryGroupUsage().setSettings(workloadManagementSettings)); } /** @@ -132,13 +130,9 @@ private List getQueryGroupsToCancelFrom(ResiliencyMode resiliencyMod if (queryGroup.getResiliencyMode() != resiliencyMode) { continue; } - Map queryGroupResourcesUsage = 
queryGroupLevelResourceUsageViews.get(queryGroup.get_id()) - .getResourceUsageData(); - for (ResourceType resourceType : TRACKED_RESOURCES) { if (queryGroup.getResourceLimits().containsKey(resourceType)) { - final double currentUsage = queryGroupResourcesUsage.get(resourceType); - if (resourceType.getQueryGroupUsage().isBreachingThresholdFor(queryGroup, currentUsage)) { + if (shouldCancelTasks(queryGroup, resourceType)) { queryGroupsToCancelFrom.add(queryGroup); break; } @@ -231,8 +225,17 @@ private double getExcessUsage(QueryGroup queryGroup, ResourceType resourceType) final QueryGroupLevelResourceUsageView queryGroupResourceUsageView = queryGroupLevelResourceUsageViews.get(queryGroup.get_id()); final double currentUsage = queryGroupResourceUsageView.getResourceUsageData().get(resourceType); - QueryGroupUsageHelper queryGroupUsageHelper = resourceType.getQueryGroupUsage(); - return queryGroupUsageHelper.getExcessUsage(queryGroup, currentUsage); + return currentUsage - getNormalisedThreshold(queryGroup, resourceType); + } + + /** + * normalises configured value with respect to node level cancellation thresholds + * @param queryGroup instance + * @return normalised value with respect to node level cancellation thresholds + */ + private double getNormalisedThreshold(QueryGroup queryGroup, ResourceType resourceType) { + double nodeLevelCancellationThreshold = resourceType.getNodeLevelThreshold(workloadManagementSettings); + return queryGroup.getResourceLimits().get(resourceType) * nodeLevelCancellationThreshold; } private void callbackOnCancel() { diff --git a/server/src/main/java/org/opensearch/wlm/tracker/CpuUsageCalculator.java b/server/src/main/java/org/opensearch/wlm/tracker/CpuUsageCalculator.java index 3cbab3db010c6..533fbeecbe945 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/CpuUsageCalculator.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/CpuUsageCalculator.java @@ -12,28 +12,33 @@ import org.opensearch.wlm.QueryGroupTask; import 
java.util.List; -import java.util.function.Supplier; +import java.util.function.LongSupplier; /** * class to help make cpu usage calculations for the query group */ -public class CpuUsageCalculator implements ResourceUsageCalculator { +public class CpuUsageCalculator extends ResourceUsageCalculator { // This value should be initialised at the start time of the process and be used throughout the codebase public static final int PROCESSOR_COUNT = Runtime.getRuntime().availableProcessors(); public static final CpuUsageCalculator INSTANCE = new CpuUsageCalculator(); + private LongSupplier nanoTimeSupplier; private CpuUsageCalculator() {} + public void setNanoTimeSupplier(LongSupplier nanoTimeSupplier) { + this.nanoTimeSupplier = nanoTimeSupplier; + } + @Override - public double calculateResourceUsage(List tasks, Supplier timeSupplier) { - double usage = tasks.stream().mapToDouble(task -> calculateTaskResourceUsage(task, timeSupplier)).sum(); + public double calculateResourceUsage(List tasks) { + double usage = tasks.stream().mapToDouble(this::calculateTaskResourceUsage).sum(); usage /= PROCESSOR_COUNT; return usage; } @Override - public double calculateTaskResourceUsage(QueryGroupTask task, Supplier nanoTimeSupplier) { - return (1.0f * task.getTotalResourceUtilization(ResourceStats.CPU)) / (nanoTimeSupplier.get() - task.getStartTimeNanos()); + public double calculateTaskResourceUsage(QueryGroupTask task) { + return (1.0f * task.getTotalResourceUtilization(ResourceStats.CPU)) / (nanoTimeSupplier.getAsLong() - task.getStartTimeNanos()); } } diff --git a/server/src/main/java/org/opensearch/wlm/tracker/MemoryUsageCalculator.java b/server/src/main/java/org/opensearch/wlm/tracker/MemoryUsageCalculator.java index dd72ac9007705..fb66ff47f58d0 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/MemoryUsageCalculator.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/MemoryUsageCalculator.java @@ -13,24 +13,23 @@ import org.opensearch.wlm.QueryGroupTask; import 
java.util.List; -import java.util.function.Supplier; /** * class to help make memory usage calculations for the query group */ -public class MemoryUsageCalculator implements ResourceUsageCalculator { +public class MemoryUsageCalculator extends ResourceUsageCalculator { public static final long HEAP_SIZE_BYTES = JvmStats.jvmStats().getMem().getHeapMax().getBytes(); public static final MemoryUsageCalculator INSTANCE = new MemoryUsageCalculator(); private MemoryUsageCalculator() {} @Override - public double calculateResourceUsage(List tasks, Supplier timeSupplier) { - return tasks.stream().mapToDouble(task -> calculateTaskResourceUsage(task, timeSupplier)).sum(); + public double calculateResourceUsage(List tasks) { + return tasks.stream().mapToDouble(this::calculateTaskResourceUsage).sum(); } @Override - public double calculateTaskResourceUsage(QueryGroupTask task, Supplier timeSupplier) { + public double calculateTaskResourceUsage(QueryGroupTask task) { return (1.0f * task.getTotalResourceUtilization(ResourceStats.MEMORY)) / HEAP_SIZE_BYTES; } } diff --git a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java index c7d9ff00929f9..f616f29a4d031 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java @@ -17,7 +17,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.function.Supplier; +import java.util.function.LongSupplier; import java.util.stream.Collectors; /** @@ -26,16 +26,15 @@ public class QueryGroupResourceUsageTrackerService { public static final EnumSet TRACKED_RESOURCES = EnumSet.allOf(ResourceType.class); private final TaskResourceTrackingService taskResourceTrackingService; - private final Supplier nanoTimeSupplier; /** * 
QueryGroupResourceTrackerService constructor * * @param taskResourceTrackingService Service that helps track resource usage of tasks running on a node. */ - public QueryGroupResourceUsageTrackerService(TaskResourceTrackingService taskResourceTrackingService, Supplier nanoTimeSupplier) { + public QueryGroupResourceUsageTrackerService(TaskResourceTrackingService taskResourceTrackingService, LongSupplier nanoTimeSupplier) { this.taskResourceTrackingService = taskResourceTrackingService; - this.nanoTimeSupplier = nanoTimeSupplier; + ResourceType.CPU.getResourceUsageCalculator().setNanoTimeSupplier(nanoTimeSupplier); } /** @@ -52,8 +51,7 @@ public Map constructQueryGroupLevelUsa // Compute the QueryGroup resource usage final Map resourceUsage = new HashMap<>(); for (ResourceType resourceType : TRACKED_RESOURCES) { - double usage = resourceType.getResourceUsageCalculator() - .calculateResourceUsage(queryGroupEntry.getValue(), nanoTimeSupplier); + double usage = resourceType.getResourceUsageCalculator().calculateResourceUsage(queryGroupEntry.getValue()); resourceUsage.put(resourceType, usage); } diff --git a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupUsageHelper.java b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupUsageHelper.java deleted file mode 100644 index 0053c21e3b78b..0000000000000 --- a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupUsageHelper.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. 
- */ - -package org.opensearch.wlm.tracker; - -import org.opensearch.cluster.metadata.QueryGroup; -import org.opensearch.common.annotation.PublicApi; -import org.opensearch.wlm.WorkloadManagementSettings; - -/** - * Utility class to provide utility methods at query group level - */ -@PublicApi(since = "2.18.0") -public abstract class QueryGroupUsageHelper { - private WorkloadManagementSettings settings; - - public WorkloadManagementSettings getSettings() { - return settings; - } - - /** - * WorkloadManagementSettings setter - * @param settings - */ - public void setSettings(WorkloadManagementSettings settings) { - this.settings = settings; - } - - /** - * Determines whether {@link QueryGroup} is breaching its threshold for the resource - * @param queryGroup - * @param currentUsage - * @return whether the query group is breaching threshold for this resource - */ - public boolean isBreachingThresholdFor(QueryGroup queryGroup, double currentUsage) { - return getExcessUsage(queryGroup, currentUsage) > 0; - } - - /** - * returns the value by which the resource usage is beyond the configured limit for the query group - * @param queryGroup instance - * @return the overshooting limit for the resource - */ - public double getExcessUsage(QueryGroup queryGroup, double currentUsage) { - return currentUsage - getNormalisedThreshold(queryGroup); - } - - /** - * normalises configured value with respect to node level cancellation thresholds - * @param queryGroup instance - * @return normalised value with respect to node level cancellation thresholds - */ - protected abstract double getNormalisedThreshold(QueryGroup queryGroup); -} diff --git a/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculator.java b/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculator.java index 2c30687b3d259..e68693b9a6433 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculator.java +++ 
b/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculator.java @@ -12,26 +12,32 @@ import org.opensearch.wlm.QueryGroupTask; import java.util.List; -import java.util.function.Supplier; +import java.util.function.LongSupplier; /** * This class is used to track query group level resource usage */ @PublicApi(since = "2.18.0") -public interface ResourceUsageCalculator { +public abstract class ResourceUsageCalculator { /** * calculates the current resource usage for the query group * * @param tasks list of tasks in the query group - * @param timeSupplier nano time supplier */ - double calculateResourceUsage(List tasks, Supplier timeSupplier); + public abstract double calculateResourceUsage(List tasks); /** * calculates the task level resource usage * @param task QueryGroupTask - * @param timeSupplier in nano seconds unit * @return task level resource usage */ - double calculateTaskResourceUsage(QueryGroupTask task, Supplier timeSupplier); + public abstract double calculateTaskResourceUsage(QueryGroupTask task); + + /** + * Since only few implementations might need this + * @param nanoTimeSupplier + */ + public void setNanoTimeSupplier(LongSupplier nanoTimeSupplier) { + + } } diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategyTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategyTests.java index 1582a45897db3..96fe0f0462c77 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategyTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategyTests.java @@ -17,7 +17,6 @@ import org.opensearch.test.OpenSearchTestCase; import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; -import org.opensearch.wlm.tracker.ResourceUsageCalculatorTrackerServiceTests.TestClock; import java.util.ArrayList; 
import java.util.Collections; @@ -28,12 +27,10 @@ import static org.opensearch.wlm.tracker.MemoryUsageCalculator.HEAP_SIZE_BYTES; public class HighestResourceConsumingTaskFirstSelectionStrategyTests extends OpenSearchTestCase { - private TestClock clock; public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsGreaterThanZero() { - clock = new TestClock(); HighestResourceConsumingTaskFirstSelectionStrategy testHighestResourceConsumingTaskFirstSelectionStrategy = - new HighestResourceConsumingTaskFirstSelectionStrategy(clock::getTime); + new HighestResourceConsumingTaskFirstSelectionStrategy(); double reduceBy = 50000.0 / HEAP_SIZE_BYTES; ResourceType resourceType = ResourceType.MEMORY; List tasks = getListOfTasks(100); @@ -46,8 +43,8 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsGrea boolean sortedInDescendingResourceUsage = IntStream.range(0, selectedTasks.size() - 1) .noneMatch( index -> ResourceType.MEMORY.getResourceUsageCalculator() - .calculateTaskResourceUsage(selectedTasks.get(index), null) < ResourceType.MEMORY.getResourceUsageCalculator() - .calculateTaskResourceUsage(selectedTasks.get(index + 1), null) + .calculateTaskResourceUsage(selectedTasks.get(index)) < ResourceType.MEMORY.getResourceUsageCalculator() + .calculateTaskResourceUsage(selectedTasks.get(index + 1)) ); assertTrue(sortedInDescendingResourceUsage); assertTrue(tasksUsageMeetsThreshold(selectedTasks, reduceBy)); @@ -84,7 +81,7 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsEqua private boolean tasksUsageMeetsThreshold(List selectedTasks, double threshold) { double memory = 0; for (QueryGroupTask task : selectedTasks) { - memory += ResourceType.MEMORY.getResourceUsageCalculator().calculateTaskResourceUsage(task, clock::getTime); + memory += ResourceType.MEMORY.getResourceUsageCalculator().calculateTaskResourceUsage(task); if ((memory - threshold) > MIN_VALUE) { return true; } diff --git 
a/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java index ebc9f366d369a..ad6ce267b2452 100644 --- a/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java +++ b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java @@ -39,25 +39,23 @@ public void testQueryGroupCpuUsage() { double expectedQueryGroupCpuUsage = 1.0 / PROCESSOR_COUNT; QueryGroupTask mockTask = createMockTaskWithResourceStats(QueryGroupTask.class, fastForwardTime, 200, 0, 123); - double actualUsage = ResourceType.CPU.getResourceUsageCalculator().calculateResourceUsage(List.of(mockTask), clock::getTime); + ResourceType.CPU.getResourceUsageCalculator().setNanoTimeSupplier(clock::getTime); + double actualUsage = ResourceType.CPU.getResourceUsageCalculator().calculateResourceUsage(List.of(mockTask)); assertEquals(expectedQueryGroupCpuUsage, actualUsage, MIN_VALUE); - double taskResourceUsage = ResourceType.CPU.getResourceUsageCalculator().calculateTaskResourceUsage(mockTask, clock::getTime); + double taskResourceUsage = ResourceType.CPU.getResourceUsageCalculator().calculateTaskResourceUsage(mockTask); assertEquals(1.0, taskResourceUsage, MIN_VALUE); } public void testQueryGroupMemoryUsage() { - TestClock clock = new TestClock(); - QueryGroupTask mockTask = createMockTaskWithResourceStats(QueryGroupTask.class, 100, 200, 0, 123); - double actualMemoryUsage = ResourceType.MEMORY.getResourceUsageCalculator() - .calculateResourceUsage(List.of(mockTask), clock::getTime); + double actualMemoryUsage = ResourceType.MEMORY.getResourceUsageCalculator().calculateResourceUsage(List.of(mockTask)); double expectedMemoryUsage = 200.0 / HEAP_SIZE_BYTES; assertEquals(expectedMemoryUsage, actualMemoryUsage, MIN_VALUE); assertEquals( 200.0 / HEAP_SIZE_BYTES, - ResourceType.MEMORY.getResourceUsageCalculator().calculateTaskResourceUsage(mockTask, 
clock::getTime), + ResourceType.MEMORY.getResourceUsageCalculator().calculateTaskResourceUsage(mockTask), MIN_VALUE ); } From 66e43b2dd2b1d540e29aa8759a65053f5706b117 Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Mon, 9 Sep 2024 10:56:17 -0700 Subject: [PATCH 28/47] add rename changes Signed-off-by: Kaushal Kumar --- ...a => MaximumResourceTaskSelectionStrategy.java} | 4 ++-- .../opensearch/wlm/cancellation/TaskCanceller.java | 6 +++--- .../QueryGroupResourceUsageTrackerService.java | 10 +++++++--- ...MaximumResourceTaskSelectionStrategyTests.java} | 14 +++++++------- .../wlm/cancellation/TaskCancellerTests.java | 14 +++++++------- 5 files changed, 26 insertions(+), 22 deletions(-) rename server/src/main/java/org/opensearch/wlm/cancellation/{HighestResourceConsumingTaskFirstSelectionStrategy.java => MaximumResourceTaskSelectionStrategy.java} (94%) rename server/src/test/java/org/opensearch/wlm/cancellation/{HighestResourceConsumingTaskFirstSelectionStrategyTests.java => MaximumResourceTaskSelectionStrategyTests.java} (88%) diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/MaximumResourceTaskSelectionStrategy.java similarity index 94% rename from server/src/main/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategy.java rename to server/src/main/java/org/opensearch/wlm/cancellation/MaximumResourceTaskSelectionStrategy.java index 3bceae9c8ffb1..9407fc32114d0 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategy.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/MaximumResourceTaskSelectionStrategy.java @@ -22,9 +22,9 @@ /** * Represents the highest resource consuming task first selection strategy. 
*/ -public class HighestResourceConsumingTaskFirstSelectionStrategy implements TaskSelectionStrategy { +public class MaximumResourceTaskSelectionStrategy implements TaskSelectionStrategy { - public HighestResourceConsumingTaskFirstSelectionStrategy() {} + public MaximumResourceTaskSelectionStrategy() {} /** * Returns a comparator that defines the sorting condition for tasks. diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java b/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java index f86e4291eb181..355ebfb838a05 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java @@ -29,7 +29,7 @@ /** * Manages the cancellation of tasks enforced by QueryGroup thresholds on resource usage criteria. - * This class utilizes a strategy pattern through {@link HighestResourceConsumingTaskFirstSelectionStrategy} to identify tasks that exceed + * This class utilizes a strategy pattern through {@link MaximumResourceTaskSelectionStrategy} to identify tasks that exceed * predefined resource usage limits and are therefore eligible for cancellation. * *

The cancellation process is initiated by evaluating the resource usage of each QueryGroup against its @@ -40,7 +40,7 @@ * views, a set of active QueryGroups, and a task selection strategy. These components collectively facilitate the * identification and cancellation of tasks that threaten to breach QueryGroup resource limits.

* - * @see HighestResourceConsumingTaskFirstSelectionStrategy + * @see MaximumResourceTaskSelectionStrategy * @see QueryGroup * @see ResourceType */ @@ -57,7 +57,7 @@ public class TaskCanceller { public TaskCanceller( WorkloadManagementSettings workloadManagementSettings, - HighestResourceConsumingTaskFirstSelectionStrategy taskSelectionStrategy, + MaximumResourceTaskSelectionStrategy taskSelectionStrategy, Map queryGroupLevelResourceUsageViews, Collection activeQueryGroups, Collection deletedQueryGroups, diff --git a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java index f616f29a4d031..ddf65e61a1275 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java @@ -13,6 +13,7 @@ import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; +import java.util.EnumMap; import java.util.EnumSet; import java.util.HashMap; import java.util.List; @@ -49,14 +50,17 @@ public Map constructQueryGroupLevelUsa // Iterate over each QueryGroup entry for (Map.Entry> queryGroupEntry : tasksByQueryGroup.entrySet()) { // Compute the QueryGroup resource usage - final Map resourceUsage = new HashMap<>(); + final Map queryGroupUsage = new EnumMap<>(ResourceType.class); for (ResourceType resourceType : TRACKED_RESOURCES) { double usage = resourceType.getResourceUsageCalculator().calculateResourceUsage(queryGroupEntry.getValue()); - resourceUsage.put(resourceType, usage); + queryGroupUsage.put(resourceType, usage); } // Add to the QueryGroup View - queryGroupViews.put(queryGroupEntry.getKey(), new QueryGroupLevelResourceUsageView(resourceUsage, queryGroupEntry.getValue())); + queryGroupViews.put( + queryGroupEntry.getKey(), + new QueryGroupLevelResourceUsageView(queryGroupUsage, 
queryGroupEntry.getValue()) + ); } return queryGroupViews; } diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategyTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/MaximumResourceTaskSelectionStrategyTests.java similarity index 88% rename from server/src/test/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategyTests.java rename to server/src/test/java/org/opensearch/wlm/cancellation/MaximumResourceTaskSelectionStrategyTests.java index 96fe0f0462c77..d522a71b630a5 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/HighestResourceConsumingTaskFirstSelectionStrategyTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/MaximumResourceTaskSelectionStrategyTests.java @@ -26,11 +26,11 @@ import static org.opensearch.wlm.cancellation.TaskCanceller.MIN_VALUE; import static org.opensearch.wlm.tracker.MemoryUsageCalculator.HEAP_SIZE_BYTES; -public class HighestResourceConsumingTaskFirstSelectionStrategyTests extends OpenSearchTestCase { +public class MaximumResourceTaskSelectionStrategyTests extends OpenSearchTestCase { public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsGreaterThanZero() { - HighestResourceConsumingTaskFirstSelectionStrategy testHighestResourceConsumingTaskFirstSelectionStrategy = - new HighestResourceConsumingTaskFirstSelectionStrategy(); + MaximumResourceTaskSelectionStrategy testHighestResourceConsumingTaskFirstSelectionStrategy = + new MaximumResourceTaskSelectionStrategy(); double reduceBy = 50000.0 / HEAP_SIZE_BYTES; ResourceType resourceType = ResourceType.MEMORY; List tasks = getListOfTasks(100); @@ -51,8 +51,8 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsGrea } public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsLesserThanZero() { - HighestResourceConsumingTaskFirstSelectionStrategy 
testHighestResourceConsumingTaskFirstSelectionStrategy = - new HighestResourceConsumingTaskFirstSelectionStrategy(); + MaximumResourceTaskSelectionStrategy testHighestResourceConsumingTaskFirstSelectionStrategy = + new MaximumResourceTaskSelectionStrategy(); double reduceBy = -50.0 / HEAP_SIZE_BYTES; ResourceType resourceType = ResourceType.MEMORY; List tasks = getListOfTasks(3); @@ -65,8 +65,8 @@ public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsLess } public void testSelectTasksToCancelSelectsTasksMeetingThreshold_ifReduceByIsEqualToZero() { - HighestResourceConsumingTaskFirstSelectionStrategy testHighestResourceConsumingTaskFirstSelectionStrategy = - new HighestResourceConsumingTaskFirstSelectionStrategy(); + MaximumResourceTaskSelectionStrategy testHighestResourceConsumingTaskFirstSelectionStrategy = + new MaximumResourceTaskSelectionStrategy(); double reduceBy = 0.0; ResourceType resourceType = ResourceType.MEMORY; List tasks = getListOfTasks(50); diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellerTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellerTests.java index 655d3aea9d5d7..8291b6c4b5b1f 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellerTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellerTests.java @@ -46,7 +46,7 @@ private static class TestTaskCancellerImpl extends TaskCanceller { public TestTaskCancellerImpl( WorkloadManagementSettings workloadManagementSettings, - HighestResourceConsumingTaskFirstSelectionStrategy highestResourceConsumingTaskFirstSelectionStrategy, + MaximumResourceTaskSelectionStrategy highestResourceConsumingTaskFirstSelectionStrategy, Map queryGroupLevelViews, Set activeQueryGroups, Set deletedQueryGroups, @@ -81,7 +81,7 @@ public void setup() { when(workloadManagementSettings.getNodeLevelMemoryCancellationThreshold()).thenReturn(0.9); taskCancellation = new TestTaskCancellerImpl( 
workloadManagementSettings, - new HighestResourceConsumingTaskFirstSelectionStrategy(), + new MaximumResourceTaskSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, @@ -208,7 +208,7 @@ public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { TestTaskCancellerImpl taskCancellation = new TestTaskCancellerImpl( workloadManagementSettings, - new HighestResourceConsumingTaskFirstSelectionStrategy(), + new MaximumResourceTaskSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, @@ -241,7 +241,7 @@ public void testCancelTasks_cancelsGivenTasks() { TestTaskCancellerImpl taskCancellation = new TestTaskCancellerImpl( workloadManagementSettings, - new HighestResourceConsumingTaskFirstSelectionStrategy(), + new MaximumResourceTaskSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, @@ -301,7 +301,7 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { TestTaskCancellerImpl taskCancellation = new TestTaskCancellerImpl( workloadManagementSettings, - new HighestResourceConsumingTaskFirstSelectionStrategy(), + new MaximumResourceTaskSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, @@ -372,7 +372,7 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN TestTaskCancellerImpl taskCancellation = new TestTaskCancellerImpl( workloadManagementSettings, - new HighestResourceConsumingTaskFirstSelectionStrategy(), + new MaximumResourceTaskSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, @@ -432,7 +432,7 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { TestTaskCancellerImpl taskCancellation = new TestTaskCancellerImpl( workloadManagementSettings, - new HighestResourceConsumingTaskFirstSelectionStrategy(), + new MaximumResourceTaskSelectionStrategy(), queryGroupLevelViews, activeQueryGroups, deletedQueryGroups, From 981b15f06e373a8ddd85389eaf4c880c7a47f272 Mon Sep 
17 00:00:00 2001 From: Kaushal Kumar Date: Mon, 9 Sep 2024 16:23:22 -0700 Subject: [PATCH 29/47] address comments Signed-off-by: Kaushal Kumar --- .../MaximumResourceTaskSelectionStrategy.java | 2 +- ...ller.java => TaskCancellationService.java} | 15 ++-- .../wlm/tracker/CpuUsageCalculator.java | 1 + ...QueryGroupLevelResourceUsageViewTests.java | 2 +- ...mumResourceTaskSelectionStrategyTests.java | 2 +- ...java => TaskCancellationServiceTests.java} | 73 ++++++++++--------- .../tracker/ResourceUsageCalculatorTests.java | 2 +- ...rceUsageCalculatorTrackerServiceTests.java | 2 +- 8 files changed, 52 insertions(+), 47 deletions(-) rename server/src/main/java/org/opensearch/wlm/cancellation/{TaskCanceller.java => TaskCancellationService.java} (94%) rename server/src/test/java/org/opensearch/wlm/cancellation/{TaskCancellerTests.java => TaskCancellationServiceTests.java} (89%) diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/MaximumResourceTaskSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/MaximumResourceTaskSelectionStrategy.java index 9407fc32114d0..7216984da8aca 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/MaximumResourceTaskSelectionStrategy.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/MaximumResourceTaskSelectionStrategy.java @@ -17,7 +17,7 @@ import java.util.List; import java.util.stream.Collectors; -import static org.opensearch.wlm.cancellation.TaskCanceller.MIN_VALUE; +import static org.opensearch.wlm.cancellation.TaskCancellationService.MIN_VALUE; /** * Represents the highest resource consuming task first selection strategy. 
diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java b/server/src/main/java/org/opensearch/wlm/cancellation/TaskCancellationService.java similarity index 94% rename from server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java rename to server/src/main/java/org/opensearch/wlm/cancellation/TaskCancellationService.java index 355ebfb838a05..24cf9f3746404 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/TaskCanceller.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/TaskCancellationService.java @@ -16,6 +16,7 @@ import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; import org.opensearch.wlm.WorkloadManagementSettings; +import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService; import java.util.ArrayList; import java.util.Collection; @@ -44,28 +45,29 @@ * @see QueryGroup * @see ResourceType */ -public class TaskCanceller { +public class TaskCancellationService { public static final double MIN_VALUE = 1e-9; private final WorkloadManagementSettings workloadManagementSettings; private final TaskSelectionStrategy taskSelectionStrategy; + private final QueryGroupResourceUsageTrackerService resourceUsageTrackerService; // a map of QueryGroupId to its corresponding QueryGroupLevelResourceUsageView object - private final Map queryGroupLevelResourceUsageViews; + Map queryGroupLevelResourceUsageViews; private final Collection activeQueryGroups; private final Collection deletedQueryGroups; private BooleanSupplier isNodeInDuress; - public TaskCanceller( + public TaskCancellationService( WorkloadManagementSettings workloadManagementSettings, - MaximumResourceTaskSelectionStrategy taskSelectionStrategy, - Map queryGroupLevelResourceUsageViews, + TaskSelectionStrategy taskSelectionStrategy, + QueryGroupResourceUsageTrackerService resourceUsageTrackerService, Collection activeQueryGroups, Collection deletedQueryGroups, BooleanSupplier isNodeInDuress ) { 
this.workloadManagementSettings = workloadManagementSettings; this.taskSelectionStrategy = taskSelectionStrategy; - this.queryGroupLevelResourceUsageViews = queryGroupLevelResourceUsageViews; + this.resourceUsageTrackerService = resourceUsageTrackerService; this.activeQueryGroups = activeQueryGroups; this.deletedQueryGroups = deletedQueryGroups; this.isNodeInDuress = isNodeInDuress; @@ -75,6 +77,7 @@ public TaskCanceller( * Cancel tasks based on the implemented strategy. */ public final void cancelTasks() { + queryGroupLevelResourceUsageViews = resourceUsageTrackerService.constructQueryGroupLevelUsageViews(); // cancel tasks from QueryGroups that are in Enforced mode that are breaching their resource limits cancelTasks(ResiliencyMode.ENFORCED); // if the node is in duress, cancel tasks accordingly. diff --git a/server/src/main/java/org/opensearch/wlm/tracker/CpuUsageCalculator.java b/server/src/main/java/org/opensearch/wlm/tracker/CpuUsageCalculator.java index 533fbeecbe945..772e698c324b3 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/CpuUsageCalculator.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/CpuUsageCalculator.java @@ -31,6 +31,7 @@ public void setNanoTimeSupplier(LongSupplier nanoTimeSupplier) { @Override public double calculateResourceUsage(List tasks) { + assert nanoTimeSupplier != null : "nanoTimeSupplier has to be set in order to calculate the resource usage"; double usage = tasks.stream().mapToDouble(this::calculateTaskResourceUsage).sum(); usage /= PROCESSOR_COUNT; diff --git a/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java b/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java index 0d254ad73f9f4..77fc6ac6e535b 100644 --- a/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java +++ b/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java @@ -14,7 +14,7 @@ import java.util.List; import java.util.Map; 
-import static org.opensearch.wlm.cancellation.TaskCanceller.MIN_VALUE; +import static org.opensearch.wlm.cancellation.TaskCancellationService.MIN_VALUE; import static org.opensearch.wlm.tracker.CpuUsageCalculator.PROCESSOR_COUNT; import static org.opensearch.wlm.tracker.MemoryUsageCalculator.HEAP_SIZE_BYTES; import static org.opensearch.wlm.tracker.ResourceUsageCalculatorTests.createMockTaskWithResourceStats; diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/MaximumResourceTaskSelectionStrategyTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/MaximumResourceTaskSelectionStrategyTests.java index d522a71b630a5..05d080d51aa65 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/MaximumResourceTaskSelectionStrategyTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/MaximumResourceTaskSelectionStrategyTests.java @@ -23,7 +23,7 @@ import java.util.List; import java.util.stream.IntStream; -import static org.opensearch.wlm.cancellation.TaskCanceller.MIN_VALUE; +import static org.opensearch.wlm.cancellation.TaskCancellationService.MIN_VALUE; import static org.opensearch.wlm.tracker.MemoryUsageCalculator.HEAP_SIZE_BYTES; public class MaximumResourceTaskSelectionStrategyTests extends OpenSearchTestCase { diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellerTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellationServiceTests.java similarity index 89% rename from server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellerTests.java rename to server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellationServiceTests.java index 8291b6c4b5b1f..07764b93a41ea 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellerTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellationServiceTests.java @@ -20,6 +20,7 @@ import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; import 
org.opensearch.wlm.WorkloadManagementSettings; +import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService; import org.opensearch.wlm.tracker.ResourceUsageCalculatorTrackerServiceTests.TestClock; import org.junit.Before; @@ -30,44 +31,23 @@ import java.util.List; import java.util.Map; import java.util.Set; -import java.util.function.BooleanSupplier; import java.util.stream.Collectors; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; -public class TaskCancellerTests extends OpenSearchTestCase { +public class TaskCancellationServiceTests extends OpenSearchTestCase { private static final String queryGroupId1 = "queryGroup1"; private static final String queryGroupId2 = "queryGroup2"; private TestClock clock; - private static class TestTaskCancellerImpl extends TaskCanceller { - - public TestTaskCancellerImpl( - WorkloadManagementSettings workloadManagementSettings, - MaximumResourceTaskSelectionStrategy highestResourceConsumingTaskFirstSelectionStrategy, - Map queryGroupLevelViews, - Set activeQueryGroups, - Set deletedQueryGroups, - BooleanSupplier isNodeInDuress - ) { - super( - workloadManagementSettings, - highestResourceConsumingTaskFirstSelectionStrategy, - queryGroupLevelViews, - activeQueryGroups, - deletedQueryGroups, - isNodeInDuress - ); - } - } - private Map queryGroupLevelViews; private Set activeQueryGroups; private Set deletedQueryGroups; - private TaskCanceller taskCancellation; + private TaskCancellationService taskCancellation; private WorkloadManagementSettings workloadManagementSettings; + private QueryGroupResourceUsageTrackerService resourceUsageTrackerService; @Before public void setup() { @@ -77,12 +57,14 @@ public void setup() { deletedQueryGroups = new HashSet<>(); clock = new TestClock(); + ResourceType.CPU.getResourceUsageCalculator().setNanoTimeSupplier(clock::getTime); when(workloadManagementSettings.getNodeLevelCpuCancellationThreshold()).thenReturn(0.9); 
when(workloadManagementSettings.getNodeLevelMemoryCancellationThreshold()).thenReturn(0.9); - taskCancellation = new TestTaskCancellerImpl( + resourceUsageTrackerService = mock(QueryGroupResourceUsageTrackerService.class); + taskCancellation = new TaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), - queryGroupLevelViews, + resourceUsageTrackerService, activeQueryGroups, deletedQueryGroups, () -> false @@ -106,6 +88,7 @@ public void testGetCancellableTasksFrom_setupAppropriateCancellationReasonAndSco QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); when(mockView.getResourceUsageData()).thenReturn(Map.of(resourceType, cpuUsage, ResourceType.MEMORY, memoryUsage)); queryGroupLevelViews.put(queryGroupId1, mockView); + taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; List cancellableTasksFrom = taskCancellation.getCancellableTasksFrom(queryGroup1); assertEquals(2, cancellableTasksFrom.size()); @@ -137,6 +120,7 @@ public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThreshold() { QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); when(mockView.getResourceUsageData()).thenReturn(Map.of(resourceType, cpuUsage, ResourceType.MEMORY, memoryUsage)); queryGroupLevelViews.put(queryGroupId1, mockView); + taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; List cancellableTasksFrom = taskCancellation.getCancellableTasksFrom(queryGroup1); assertEquals(2, cancellableTasksFrom.size()); @@ -162,6 +146,7 @@ public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThresholdForMem queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); + taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); assertEquals(2, cancellableTasksFrom.size()); @@ -185,6 +170,7 @@ public void 
testGetCancellableTasksFrom_returnsNoTasksWhenNotBreachingThreshold( when(mockView.getResourceUsageData()).thenReturn(Map.of(ResourceType.CPU, cpuUsage, ResourceType.MEMORY, memoryUsage)); queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); + taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; List cancellableTasksFrom = taskCancellation.getCancellableTasksFrom(queryGroup1); assertTrue(cancellableTasksFrom.isEmpty()); @@ -205,11 +191,12 @@ public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { QueryGroupLevelResourceUsageView mockView = createResourceUsageViewMock(); queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); + taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; - TestTaskCancellerImpl taskCancellation = new TestTaskCancellerImpl( + TaskCancellationService taskCancellation = new TaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), - queryGroupLevelViews, + resourceUsageTrackerService, activeQueryGroups, deletedQueryGroups, () -> false @@ -239,20 +226,23 @@ public void testCancelTasks_cancelsGivenTasks() { queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); - TestTaskCancellerImpl taskCancellation = new TestTaskCancellerImpl( + TaskCancellationService taskCancellation = new TaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), - queryGroupLevelViews, + resourceUsageTrackerService, activeQueryGroups, deletedQueryGroups, () -> false ); + taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); assertEquals(2, cancellableTasksFrom.size()); assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); + 
when(resourceUsageTrackerService.constructQueryGroupLevelUsageViews()).thenReturn(queryGroupLevelViews); taskCancellation.cancelTasks(); assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); assertTrue(cancellableTasksFrom.get(1).getTask().isCancelled()); @@ -299,15 +289,17 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { activeQueryGroups.add(activeQueryGroup); deletedQueryGroups.add(deletedQueryGroup); - TestTaskCancellerImpl taskCancellation = new TestTaskCancellerImpl( + TaskCancellationService taskCancellation = new TaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), - queryGroupLevelViews, + resourceUsageTrackerService, activeQueryGroups, deletedQueryGroups, () -> true ); + taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); assertEquals(2, cancellableTasksFrom.size()); assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); @@ -320,6 +312,7 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { assertEquals(1000, cancellableTasksFromDeletedQueryGroups.get(0).getTask().getId()); assertEquals(1001, cancellableTasksFromDeletedQueryGroups.get(1).getTask().getId()); + when(resourceUsageTrackerService.constructQueryGroupLevelUsageViews()).thenReturn(queryGroupLevelViews); taskCancellation.cancelTasks(); assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); @@ -370,14 +363,15 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN activeQueryGroups.add(activeQueryGroup); deletedQueryGroups.add(deletedQueryGroup); - TestTaskCancellerImpl taskCancellation = new TestTaskCancellerImpl( + TaskCancellationService taskCancellation = new TaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), - queryGroupLevelViews, + resourceUsageTrackerService, activeQueryGroups, 
deletedQueryGroups, () -> false ); + taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); assertEquals(2, cancellableTasksFrom.size()); @@ -391,6 +385,7 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN assertEquals(1000, cancellableTasksFromDeletedQueryGroups.get(0).getTask().getId()); assertEquals(1001, cancellableTasksFromDeletedQueryGroups.get(1).getTask().getId()); + when(resourceUsageTrackerService.constructQueryGroupLevelUsageViews()).thenReturn(queryGroupLevelViews); taskCancellation.cancelTasks(); assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); @@ -430,15 +425,17 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { queryGroupLevelViews.put(queryGroupId2, mockView); Collections.addAll(activeQueryGroups, queryGroup1, queryGroup2); - TestTaskCancellerImpl taskCancellation = new TestTaskCancellerImpl( + TaskCancellationService taskCancellation = new TaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), - queryGroupLevelViews, + resourceUsageTrackerService, activeQueryGroups, deletedQueryGroups, () -> true ); + taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); assertEquals(2, cancellableTasksFrom.size()); assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); @@ -449,6 +446,7 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { assertEquals(5678, cancellableTasksFrom1.get(0).getTask().getId()); assertEquals(8765, cancellableTasksFrom1.get(1).getTask().getId()); + when(resourceUsageTrackerService.constructQueryGroupLevelUsageViews()).thenReturn(queryGroupLevelViews); taskCancellation.cancelTasks(); assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); 
assertTrue(cancellableTasksFrom.get(1).getTask().isCancelled()); @@ -475,6 +473,7 @@ public void testGetAllCancellableTasks_ReturnsNoTasksWhenNotBreachingThresholds( ); queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); + taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; List allCancellableTasks = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); assertTrue(allCancellableTasks.isEmpty()); @@ -497,6 +496,7 @@ public void testGetAllCancellableTasks_ReturnsTasksWhenBreachingThresholds() { when(mockView.getResourceUsageData()).thenReturn(Map.of(ResourceType.CPU, cpuUsage, ResourceType.MEMORY, memoryUsage)); queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); + taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; List allCancellableTasks = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); assertEquals(2, allCancellableTasks.size()); @@ -526,6 +526,7 @@ public void testGetCancellableTasksFrom_doesNotReturnTasksWhenQueryGroupIdNotFou queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); activeQueryGroups.add(queryGroup2); + taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; List cancellableTasksFrom = taskCancellation.getCancellableTasksFrom(queryGroup2); assertEquals(0, cancellableTasksFrom.size()); diff --git a/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java index ad6ce267b2452..044239e2a1ecd 100644 --- a/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java +++ b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java @@ -20,7 +20,7 @@ import java.util.List; import java.util.Map; -import static org.opensearch.wlm.cancellation.TaskCanceller.MIN_VALUE; +import static 
org.opensearch.wlm.cancellation.TaskCancellationService.MIN_VALUE; import static org.opensearch.wlm.tracker.CpuUsageCalculator.PROCESSOR_COUNT; import static org.opensearch.wlm.tracker.MemoryUsageCalculator.HEAP_SIZE_BYTES; import static org.mockito.Mockito.mock; diff --git a/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTrackerServiceTests.java b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTrackerServiceTests.java index d109f37243ee2..63913f5a8f67e 100644 --- a/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTrackerServiceTests.java +++ b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTrackerServiceTests.java @@ -32,7 +32,7 @@ import java.util.concurrent.atomic.AtomicBoolean; import static org.opensearch.wlm.QueryGroupTask.QUERY_GROUP_ID_HEADER; -import static org.opensearch.wlm.cancellation.TaskCanceller.MIN_VALUE; +import static org.opensearch.wlm.cancellation.TaskCancellationService.MIN_VALUE; import static org.opensearch.wlm.tracker.CpuUsageCalculator.PROCESSOR_COUNT; import static org.opensearch.wlm.tracker.MemoryUsageCalculator.HEAP_SIZE_BYTES; import static org.mockito.ArgumentMatchers.anyString; From 0be10233b9ac224e918f9f8d39c0d0a666f70936 Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Tue, 10 Sep 2024 12:31:03 -0700 Subject: [PATCH 30/47] initial changes Signed-off-by: Kaushal Kumar --- .../main/java/org/opensearch/node/Node.java | 4 +- .../org/opensearch/wlm/QueryGroupService.java | 44 ++++++++----------- .../main/java/org/opensearch/wlm/WlmMode.java | 3 ++ .../cancellation/TaskCancellationService.java | 34 ++++++++------ ...adManagementTransportInterceptorTests.java | 12 +++-- .../TaskCancellationServiceTests.java | 32 ++++++-------- ...eryGroupRequestOperationListenerTests.java | 11 +++-- 7 files changed, 71 insertions(+), 69 deletions(-) diff --git a/server/src/main/java/org/opensearch/node/Node.java 
b/server/src/main/java/org/opensearch/node/Node.java index eb825c0f0cf26..39153573d24c4 100644 --- a/server/src/main/java/org/opensearch/node/Node.java +++ b/server/src/main/java/org/opensearch/node/Node.java @@ -270,6 +270,7 @@ import org.opensearch.usage.UsageService; import org.opensearch.watcher.ResourceWatcherService; import org.opensearch.wlm.QueryGroupService; +import org.opensearch.wlm.cancellation.MaximumResourceTaskSelectionStrategy; import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService; import org.opensearch.wlm.WorkloadManagementSettings; import org.opensearch.wlm.WorkloadManagementTransportInterceptor; @@ -1036,8 +1037,9 @@ protected Node( settings, settingsModule.getClusterSettings() ); + final QueryGroupService queryGroupService = new QueryGroupService( - queryGroupResourceUsageTrackerService, + new org.opensearch.wlm.cancellation.TaskCancellationService(workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), queryGroupResourceUsageTrackerService), clusterService, threadPool, workloadManagementSettings diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java index b4fe733177b43..ded84adde72be 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java @@ -22,12 +22,10 @@ import org.opensearch.search.backpressure.trackers.NodeDuressTrackers.NodeDuressTracker; import org.opensearch.threadpool.Scheduler; import org.opensearch.threadpool.ThreadPool; -import org.opensearch.wlm.cancellation.DefaultTaskCancellation; -import org.opensearch.wlm.cancellation.DefaultTaskSelectionStrategy; +import org.opensearch.wlm.cancellation.TaskCancellationService; import org.opensearch.wlm.stats.QueryGroupState; import org.opensearch.wlm.stats.QueryGroupStats; import org.opensearch.wlm.stats.QueryGroupStats.QueryGroupStatsHolder; -import 
org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService; import java.io.IOException; import java.util.HashMap; @@ -44,7 +42,7 @@ public class QueryGroupService extends AbstractLifecycleComponent implements Clu private final Map queryGroupStateMap; private static final Logger logger = LogManager.getLogger(QueryGroupService.class); - private final QueryGroupResourceUsageTrackerService queryGroupUsageTracker; + private final TaskCancellationService taskCancellationService; private volatile Scheduler.Cancellable scheduledFuture; private final ThreadPool threadPool; private final ClusterService clusterService; @@ -54,30 +52,34 @@ public class QueryGroupService extends AbstractLifecycleComponent implements Clu private NodeDuressTrackers nodeDuressTrackers; public QueryGroupService( - QueryGroupResourceUsageTrackerService queryGroupUsageTracker, + TaskCancellationService taskCancellationService, ClusterService clusterService, ThreadPool threadPool, WorkloadManagementSettings workloadManagementSettings) { - this(queryGroupUsageTracker, clusterService, threadPool, workloadManagementSettings, new HashMap<>()); + + this(taskCancellationService, clusterService, threadPool, workloadManagementSettings, + new NodeDuressTrackers( + Map.of(ResourceType.CPU, new NodeDuressTracker(() -> + workloadManagementSettings.getNodeLevelCpuCancellationThreshold() < ProcessProbe.getInstance().getProcessCpuPercent() / 100.0, () -> 3), + ResourceType.MEMORY, new NodeDuressTracker( + () -> workloadManagementSettings.getNodeLevelMemoryCancellationThreshold() <= JvmStats.jvmStats().getMem().getHeapUsedPercent() / 100.0, () -> 3)) + ), + new HashMap<>()); } public QueryGroupService( - QueryGroupResourceUsageTrackerService queryGroupUsageTracker, + TaskCancellationService taskCancellationService, ClusterService clusterService, ThreadPool threadPool, WorkloadManagementSettings workloadManagementSettings, + NodeDuressTrackers nodeDuressTrackers, Map queryGroupStateMap ) { - 
this.queryGroupUsageTracker = queryGroupUsageTracker; + this.taskCancellationService = taskCancellationService; this.clusterService = clusterService; this.threadPool = threadPool; this.workloadManagementSettings = workloadManagementSettings; - this.nodeDuressTrackers = new NodeDuressTrackers( - Map.of(ResourceType.CPU, new NodeDuressTracker(() -> - workloadManagementSettings.getNodeLevelCpuCancellationThreshold() < ProcessProbe.getInstance().getProcessCpuPercent() / 100.0, () -> 3), - ResourceType.MEMORY, new NodeDuressTracker( - () -> workloadManagementSettings.getNodeLevelMemoryCancellationThreshold() <= JvmStats.jvmStats().getMem().getHeapUsedPercent() / 100.0, () -> 3)) - ); + this.nodeDuressTrackers = nodeDuressTrackers; this.activeQueryGroups = getActiveQueryGroupsFromClusterState(); // this logic here is to ensure the proper initialisation of queryGroupState for query groups from persisted metadata @@ -92,18 +94,8 @@ protected void doRun() { if (workloadManagementSettings.getWlmMode() == WlmMode.DISABLED) { return; } - - Map queryGroupLevelResourceUsageViews = queryGroupUsageTracker - .constructQueryGroupLevelUsageViews(); - DefaultTaskCancellation defaultTaskCancellation = new DefaultTaskCancellation( - workloadManagementSettings, - new DefaultTaskSelectionStrategy(), - queryGroupLevelResourceUsageViews, - activeQueryGroups, - deletedQueryGroups, - () -> nodeDuressTrackers.isNodeInDuress() - ); - defaultTaskCancellation.cancelTasks(); +// taskCancellationService.cancelTasks(activeQueryGroups, deletedQueryGroups); + taskCancellationService.cancelTasks(() -> nodeDuressTrackers.isNodeInDuress()); } /** diff --git a/server/src/main/java/org/opensearch/wlm/WlmMode.java b/server/src/main/java/org/opensearch/wlm/WlmMode.java index 06837ed2cacc4..121686458692b 100644 --- a/server/src/main/java/org/opensearch/wlm/WlmMode.java +++ b/server/src/main/java/org/opensearch/wlm/WlmMode.java @@ -8,9 +8,12 @@ package org.opensearch.wlm; +import 
org.opensearch.common.annotation.PublicApi; + /** * Enum to hold the values whether wlm is enabled or not */ +@PublicApi(since = "2.18.0") public enum WlmMode { ENABLED("enabled"), MONITOR_ONLY("monitor_only"), diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/TaskCancellationService.java b/server/src/main/java/org/opensearch/wlm/cancellation/TaskCancellationService.java index 24cf9f3746404..fa10230167567 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/TaskCancellationService.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/TaskCancellationService.java @@ -9,6 +9,9 @@ package org.opensearch.wlm.cancellation; import org.opensearch.cluster.metadata.QueryGroup; +import org.opensearch.monitor.jvm.JvmStats; +import org.opensearch.monitor.process.ProcessProbe; +import org.opensearch.search.backpressure.trackers.NodeDuressTrackers; import org.opensearch.tasks.CancellableTask; import org.opensearch.tasks.TaskCancellation; import org.opensearch.wlm.MutableQueryGroupFragment.ResiliencyMode; @@ -18,10 +21,7 @@ import org.opensearch.wlm.WorkloadManagementSettings; import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.function.BooleanSupplier; import java.util.function.Consumer; import java.util.stream.Collectors; @@ -53,38 +53,46 @@ public class TaskCancellationService { private final QueryGroupResourceUsageTrackerService resourceUsageTrackerService; // a map of QueryGroupId to its corresponding QueryGroupLevelResourceUsageView object Map queryGroupLevelResourceUsageViews; - private final Collection activeQueryGroups; - private final Collection deletedQueryGroups; - private BooleanSupplier isNodeInDuress; + private Collection activeQueryGroups; + private Collection deletedQueryGroups; + + public TaskCancellationService( + WorkloadManagementSettings 
workloadManagementSettings, + TaskSelectionStrategy taskSelectionStrategy, + QueryGroupResourceUsageTrackerService resourceUsageTrackerService + ) { + this(workloadManagementSettings, taskSelectionStrategy, resourceUsageTrackerService, Collections.emptySet(), Collections.emptySet()); + } public TaskCancellationService( WorkloadManagementSettings workloadManagementSettings, TaskSelectionStrategy taskSelectionStrategy, QueryGroupResourceUsageTrackerService resourceUsageTrackerService, Collection activeQueryGroups, - Collection deletedQueryGroups, - BooleanSupplier isNodeInDuress + Collection deletedQueryGroups ) { this.workloadManagementSettings = workloadManagementSettings; this.taskSelectionStrategy = taskSelectionStrategy; this.resourceUsageTrackerService = resourceUsageTrackerService; this.activeQueryGroups = activeQueryGroups; this.deletedQueryGroups = deletedQueryGroups; - this.isNodeInDuress = isNodeInDuress; } /** * Cancel tasks based on the implemented strategy. */ - public final void cancelTasks() { +// public final void cancelTasks(Collection activeQueryGroups, Collection deletedQueryGroups) { + public final void cancelTasks(BooleanSupplier isNodeInDuress) { +// this.activeQueryGroups = activeQueryGroups; + this.deletedQueryGroups = deletedQueryGroups; queryGroupLevelResourceUsageViews = resourceUsageTrackerService.constructQueryGroupLevelUsageViews(); // cancel tasks from QueryGroups that are in Enforced mode that are breaching their resource limits cancelTasks(ResiliencyMode.ENFORCED); // if the node is in duress, cancel tasks accordingly. 
- handleNodeDuress(); + handleNodeDuress(isNodeInDuress); } - private void handleNodeDuress() { + private void handleNodeDuress(BooleanSupplier isNodeInDuress) { if (!isNodeInDuress.getAsBoolean()) { return; } diff --git a/server/src/test/java/org/opensearch/wlm/WorkloadManagementTransportInterceptorTests.java b/server/src/test/java/org/opensearch/wlm/WorkloadManagementTransportInterceptorTests.java index 6d719a8be9df2..3a16c4e9a9691 100644 --- a/server/src/test/java/org/opensearch/wlm/WorkloadManagementTransportInterceptorTests.java +++ b/server/src/test/java/org/opensearch/wlm/WorkloadManagementTransportInterceptorTests.java @@ -17,17 +17,16 @@ import org.opensearch.transport.TransportRequest; import org.opensearch.transport.TransportRequestHandler; import org.opensearch.wlm.WorkloadManagementTransportInterceptor.RequestHandler; -import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService; +import org.opensearch.wlm.cancellation.TaskCancellationService; import java.util.Collections; -import java.util.HashMap; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; import static org.opensearch.threadpool.ThreadPool.Names.SAME; public class WorkloadManagementTransportInterceptorTests extends OpenSearchTestCase { - private QueryGroupResourceUsageTrackerService mockQueryGroupUsageTracker; + private TaskCancellationService mockTaskCancellationService; private ClusterService mockClusterService; private ThreadPool mockThreadPool; private WorkloadManagementSettings mockWorkloadManagementSettings; @@ -36,7 +35,7 @@ public class WorkloadManagementTransportInterceptorTests extends OpenSearchTestC public void setUp() throws Exception { super.setUp(); - mockQueryGroupUsageTracker = mock(QueryGroupResourceUsageTrackerService.class); + mockTaskCancellationService = mock(TaskCancellationService.class); mockClusterService = mock(ClusterService.class); mockThreadPool = mock(ThreadPool.class); mockWorkloadManagementSettings = 
mock(WorkloadManagementSettings.class); @@ -48,11 +47,10 @@ public void setUp() throws Exception { when(metadata.queryGroups()).thenReturn(Collections.emptyMap()); sut = new WorkloadManagementTransportInterceptor(threadPool, new QueryGroupService( - mockQueryGroupUsageTracker, + mockTaskCancellationService, mockClusterService, mockThreadPool, - mockWorkloadManagementSettings, - new HashMap<>() + mockWorkloadManagementSettings ) ); } diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellationServiceTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellationServiceTests.java index 07764b93a41ea..0914d3f0785ed 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellationServiceTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellationServiceTests.java @@ -66,8 +66,7 @@ public void setup() { new MaximumResourceTaskSelectionStrategy(), resourceUsageTrackerService, activeQueryGroups, - deletedQueryGroups, - () -> false + deletedQueryGroups ); } @@ -198,8 +197,7 @@ public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { new MaximumResourceTaskSelectionStrategy(), resourceUsageTrackerService, activeQueryGroups, - deletedQueryGroups, - () -> false + deletedQueryGroups ); List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.SOFT); @@ -209,14 +207,15 @@ public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { public void testCancelTasks_cancelsGivenTasks() { ResourceType resourceType = ResourceType.CPU; double cpuUsage = 0.011; - double memoryUsage = 0.0; + double memoryUsage = 0.011; Double threshold = 0.01; QueryGroup queryGroup1 = new QueryGroup( "testQueryGroup", queryGroupId1, - new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold)), + new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold, + ResourceType.MEMORY, threshold)), 1L ); @@ -231,8 +230,7 @@ 
public void testCancelTasks_cancelsGivenTasks() { new MaximumResourceTaskSelectionStrategy(), resourceUsageTrackerService, activeQueryGroups, - deletedQueryGroups, - () -> false + deletedQueryGroups ); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; @@ -243,7 +241,8 @@ public void testCancelTasks_cancelsGivenTasks() { assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); when(resourceUsageTrackerService.constructQueryGroupLevelUsageViews()).thenReturn(queryGroupLevelViews); - taskCancellation.cancelTasks(); +// taskCancellation.cancelTasks(activeQueryGroups, deletedQueryGroups); + taskCancellation.cancelTasks(() -> false); assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); assertTrue(cancellableTasksFrom.get(1).getTask().isCancelled()); } @@ -294,8 +293,7 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { new MaximumResourceTaskSelectionStrategy(), resourceUsageTrackerService, activeQueryGroups, - deletedQueryGroups, - () -> true + deletedQueryGroups ); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; @@ -313,7 +311,7 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { assertEquals(1001, cancellableTasksFromDeletedQueryGroups.get(1).getTask().getId()); when(resourceUsageTrackerService.constructQueryGroupLevelUsageViews()).thenReturn(queryGroupLevelViews); - taskCancellation.cancelTasks(); + taskCancellation.cancelTasks(() -> true); assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); assertTrue(cancellableTasksFrom.get(1).getTask().isCancelled()); @@ -368,8 +366,7 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN new MaximumResourceTaskSelectionStrategy(), resourceUsageTrackerService, activeQueryGroups, - deletedQueryGroups, - () -> false + deletedQueryGroups ); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; @@ -386,7 +383,7 @@ public void 
testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN assertEquals(1001, cancellableTasksFromDeletedQueryGroups.get(1).getTask().getId()); when(resourceUsageTrackerService.constructQueryGroupLevelUsageViews()).thenReturn(queryGroupLevelViews); - taskCancellation.cancelTasks(); + taskCancellation.cancelTasks(() -> false); assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); assertTrue(cancellableTasksFrom.get(1).getTask().isCancelled()); @@ -430,8 +427,7 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { new MaximumResourceTaskSelectionStrategy(), resourceUsageTrackerService, activeQueryGroups, - deletedQueryGroups, - () -> true + deletedQueryGroups ); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; @@ -447,7 +443,7 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { assertEquals(8765, cancellableTasksFrom1.get(1).getTask().getId()); when(resourceUsageTrackerService.constructQueryGroupLevelUsageViews()).thenReturn(queryGroupLevelViews); - taskCancellation.cancelTasks(); + taskCancellation.cancelTasks(() -> true); assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); assertTrue(cancellableTasksFrom.get(1).getTask().isCancelled()); assertTrue(cancellableTasksFrom1.get(0).getTask().isCancelled()); diff --git a/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java b/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java index 6359f8a8f9752..175ec88a6733d 100644 --- a/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java +++ b/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java @@ -21,6 +21,7 @@ import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; import org.opensearch.wlm.WorkloadManagementSettings; +import org.opensearch.wlm.cancellation.TaskCancellationService; import 
org.opensearch.wlm.stats.QueryGroupState; import org.opensearch.wlm.stats.QueryGroupStats; import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService; @@ -35,7 +36,7 @@ public class QueryGroupRequestOperationListenerTests extends OpenSearchTestCase public static final int ITERATIONS = 20; ThreadPool testThreadPool; QueryGroupService queryGroupService; - private QueryGroupResourceUsageTrackerService mockQueryGroupUsageTracker; + private TaskCancellationService taskCancellationService; private ClusterService mockClusterService; private WorkloadManagementSettings mockWorkloadManagementSettings; Map queryGroupStateMap; @@ -44,7 +45,7 @@ public class QueryGroupRequestOperationListenerTests extends OpenSearchTestCase public void setUp() throws Exception { super.setUp(); - mockQueryGroupUsageTracker = mock(QueryGroupResourceUsageTrackerService.class); + taskCancellationService = mock(TaskCancellationService.class); mockClusterService = mock(ClusterService.class); mockWorkloadManagementSettings = mock(WorkloadManagementSettings.class); queryGroupStateMap = new HashMap<>(); @@ -102,10 +103,11 @@ public void testMultiThreadedValidQueryGroupRequestFailures() { queryGroupStateMap.put(testQueryGroupId, new QueryGroupState()); setupMockedQueryGroupsFromClusterState(); queryGroupService = new QueryGroupService( - mockQueryGroupUsageTracker, + taskCancellationService, mockClusterService, testThreadPool, mockWorkloadManagementSettings, + null, queryGroupStateMap ); @@ -190,10 +192,11 @@ private void assertSuccess( setupMockedQueryGroupsFromClusterState(); queryGroupService = new QueryGroupService( - mockQueryGroupUsageTracker, + taskCancellationService, mockClusterService, testThreadPool, mockWorkloadManagementSettings, + null, queryGroupStateMap ); From caf5914f7963a85e417f6dfa32f2c8e436f688a3 Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Tue, 10 Sep 2024 16:52:49 -0700 Subject: [PATCH 31/47] refactor changes and logical bug fix Signed-off-by: Kaushal Kumar --- 
.../org/opensearch/wlm/QueryGroupTask.java | 22 ++- .../MaximumResourceTaskSelectionStrategy.java | 2 +- ...=> QueryGroupTaskCancellationService.java} | 150 +++++++----------- .../wlm/tracker/CpuUsageCalculator.java | 9 +- ...QueryGroupResourceUsageTrackerService.java | 4 +- .../wlm/tracker/ResourceUsageCalculator.java | 9 -- ...QueryGroupLevelResourceUsageViewTests.java | 2 +- ...mumResourceTaskSelectionStrategyTests.java | 2 +- ...eryGroupTaskCancellationServiceTests.java} | 81 ++++------ .../tracker/ResourceUsageCalculatorTests.java | 13 +- ...rceUsageCalculatorTrackerServiceTests.java | 23 ++- 11 files changed, 124 insertions(+), 193 deletions(-) rename server/src/main/java/org/opensearch/wlm/cancellation/{TaskCancellationService.java => QueryGroupTaskCancellationService.java} (59%) rename server/src/test/java/org/opensearch/wlm/cancellation/{TaskCancellationServiceTests.java => QueryGroupTaskCancellationServiceTests.java} (91%) diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java b/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java index 6ffc2fe9e802e..842a320a65075 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java @@ -18,6 +18,7 @@ import java.util.Map; import java.util.Optional; +import java.util.function.LongSupplier; import java.util.function.Supplier; import static org.opensearch.search.SearchService.NO_TIMEOUT; @@ -31,10 +32,11 @@ public class QueryGroupTask extends CancellableTask { private static final Logger logger = LogManager.getLogger(QueryGroupTask.class); public static final String QUERY_GROUP_ID_HEADER = "queryGroupId"; public static final Supplier DEFAULT_QUERY_GROUP_ID_SUPPLIER = () -> "DEFAULT_QUERY_GROUP"; + private LongSupplier nanoTimeSupplier; private String queryGroupId; public QueryGroupTask(long id, String type, String action, String description, TaskId parentTaskId, Map headers) { - this(id, type, action, description, 
parentTaskId, headers, NO_TIMEOUT); + this(id, type, action, description, parentTaskId, headers, NO_TIMEOUT, System::nanoTime); } public QueryGroupTask( @@ -49,6 +51,20 @@ public QueryGroupTask( super(id, type, action, description, parentTaskId, headers, cancelAfterTimeInterval); } + public QueryGroupTask( + long id, + String type, + String action, + String description, + TaskId parentTaskId, + Map headers, + TimeValue cancelAfterTimeInterval, + LongSupplier nanoTimeSupplier + ) { + super(id, type, action, description, parentTaskId, headers, cancelAfterTimeInterval); + this.nanoTimeSupplier = nanoTimeSupplier; + } + /** * This method should always be called after calling setQueryGroupId at least once on this object * @return task queryGroupId @@ -71,6 +87,10 @@ public final void setQueryGroupId(final ThreadContext threadContext) { .orElse(DEFAULT_QUERY_GROUP_ID_SUPPLIER.get()); } + public long getElapsedTime() { + return nanoTimeSupplier.getAsLong() - getStartTimeNanos(); + } + @Override public boolean shouldCancelChildrenOnCancellation() { return false; diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/MaximumResourceTaskSelectionStrategy.java b/server/src/main/java/org/opensearch/wlm/cancellation/MaximumResourceTaskSelectionStrategy.java index 7216984da8aca..ffb326c07e7ac 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/MaximumResourceTaskSelectionStrategy.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/MaximumResourceTaskSelectionStrategy.java @@ -17,7 +17,7 @@ import java.util.List; import java.util.stream.Collectors; -import static org.opensearch.wlm.cancellation.TaskCancellationService.MIN_VALUE; +import static org.opensearch.wlm.cancellation.QueryGroupTaskCancellationService.MIN_VALUE; /** * Represents the highest resource consuming task first selection strategy. 
diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/TaskCancellationService.java b/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java similarity index 59% rename from server/src/main/java/org/opensearch/wlm/cancellation/TaskCancellationService.java rename to server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java index 24cf9f3746404..a2c97c8d8635b 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/TaskCancellationService.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java @@ -45,7 +45,7 @@ * @see QueryGroup * @see ResourceType */ -public class TaskCancellationService { +public class QueryGroupTaskCancellationService { public static final double MIN_VALUE = 1e-9; private final WorkloadManagementSettings workloadManagementSettings; @@ -55,36 +55,33 @@ public class TaskCancellationService { Map queryGroupLevelResourceUsageViews; private final Collection activeQueryGroups; private final Collection deletedQueryGroups; - private BooleanSupplier isNodeInDuress; - public TaskCancellationService( + public QueryGroupTaskCancellationService( WorkloadManagementSettings workloadManagementSettings, TaskSelectionStrategy taskSelectionStrategy, QueryGroupResourceUsageTrackerService resourceUsageTrackerService, Collection activeQueryGroups, - Collection deletedQueryGroups, - BooleanSupplier isNodeInDuress + Collection deletedQueryGroups ) { this.workloadManagementSettings = workloadManagementSettings; this.taskSelectionStrategy = taskSelectionStrategy; this.resourceUsageTrackerService = resourceUsageTrackerService; this.activeQueryGroups = activeQueryGroups; this.deletedQueryGroups = deletedQueryGroups; - this.isNodeInDuress = isNodeInDuress; } /** * Cancel tasks based on the implemented strategy. 
*/ - public final void cancelTasks() { + public final void cancelTasks(BooleanSupplier isNodeInDuress) { queryGroupLevelResourceUsageViews = resourceUsageTrackerService.constructQueryGroupLevelUsageViews(); // cancel tasks from QueryGroups that are in Enforced mode that are breaching their resource limits cancelTasks(ResiliencyMode.ENFORCED); // if the node is in duress, cancel tasks accordingly. - handleNodeDuress(); + handleNodeDuress(isNodeInDuress); } - private void handleNodeDuress() { + private void handleNodeDuress(BooleanSupplier isNodeInDuress) { if (!isNodeInDuress.getAsBoolean()) { return; } @@ -109,7 +106,9 @@ private void cancelTasksFromDeletedQueryGroups() { * @return List of tasks that can be cancelled */ List getAllCancellableTasks(ResiliencyMode resiliencyMode) { - return getAllCancellableTasks(getQueryGroupsToCancelFrom(resiliencyMode)); + return getAllCancellableTasks( + activeQueryGroups.stream().filter(queryGroup -> queryGroup.getResiliencyMode() == resiliencyMode).collect(Collectors.toList()) + ); } /** @@ -118,106 +117,63 @@ List getAllCancellableTasks(ResiliencyMode resiliencyMode) { * @return List of tasks that can be cancelled */ List getAllCancellableTasks(Collection queryGroups) { - return queryGroups.stream().flatMap(queryGroup -> getCancellableTasksFrom(queryGroup).stream()).collect(Collectors.toList()); - } - - /** - * returns the list of QueryGroups breaching their resource limits. 
- * - * @return List of QueryGroups - */ - private List getQueryGroupsToCancelFrom(ResiliencyMode resiliencyMode) { - final List queryGroupsToCancelFrom = new ArrayList<>(); - - for (QueryGroup queryGroup : this.activeQueryGroups) { - if (queryGroup.getResiliencyMode() != resiliencyMode) { - continue; - } + List taskCancellations = new ArrayList<>(); + for (QueryGroup queryGroup : queryGroups) { + final List reasons = new ArrayList<>(); + List selectedTasks = new ArrayList<>(); for (ResourceType resourceType : TRACKED_RESOURCES) { - if (queryGroup.getResourceLimits().containsKey(resourceType)) { - if (shouldCancelTasks(queryGroup, resourceType)) { - queryGroupsToCancelFrom.add(queryGroup); - break; - } - + // We need to consider the already selected tasks since those tasks also consumed the resources + double excessUsage = getExcessUsage(queryGroup, resourceType) - resourceType.getResourceUsageCalculator() + .calculateResourceUsage(selectedTasks); + if (excessUsage > MIN_VALUE) { + reasons.add(new TaskCancellation.Reason(generateReasonString(queryGroup, resourceType), 1)); + // TODO: We will need to add the cancellation callback for these resources for the queryGroup to reflect stats + + // Only add tasks not already added to avoid double cancellations + selectedTasks.addAll( + taskSelectionStrategy.selectTasksForCancellation(getTasksFor(queryGroup), excessUsage, resourceType) + .stream() + .filter(x -> selectedTasks.stream().noneMatch(y -> x.getId() != y.getId())) + .collect(Collectors.toList()) + ); } } - } - - return queryGroupsToCancelFrom; - } - - private void cancelTasks(ResiliencyMode resiliencyMode) { - cancelTasks(getAllCancellableTasks(resiliencyMode)); - } - - private void cancelTasks(List cancellableTasks) { - cancellableTasks.forEach(TaskCancellation::cancel); - } - /** - * Get cancellable tasks from a specific queryGroup. 
- * - * @param queryGroup The QueryGroup from which to get cancellable tasks - * @return List of tasks that can be cancelled - */ - List getCancellableTasksFrom(QueryGroup queryGroup) { - return TRACKED_RESOURCES.stream() - .filter(resourceType -> shouldCancelTasks(queryGroup, resourceType)) - .flatMap(resourceType -> getTaskCancellations(queryGroup, resourceType).stream()) - .collect(Collectors.toList()); - } - - private boolean shouldCancelTasks(QueryGroup queryGroup, ResourceType resourceType) { - return getExcessUsage(queryGroup, resourceType) > 0; - } - - private List getTaskCancellations(QueryGroup queryGroup, ResourceType resourceType) { - List selectedTasksToCancel = taskSelectionStrategy.selectTasksForCancellation( - queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getActiveTasks(), - getExcessUsage(queryGroup, resourceType), - resourceType - ); - List taskCancellations = new ArrayList<>(); - for (QueryGroupTask task : selectedTasksToCancel) { - String cancellationReason = createCancellationReason(queryGroup, task, resourceType); - taskCancellations.add(createTaskCancellation(task, cancellationReason)); + if (!reasons.isEmpty()) { + taskCancellations.addAll( + selectedTasks.stream().map(task -> createTaskCancellation(task, reasons)).collect(Collectors.toList()) + ); + } } return taskCancellations; } - private String createCancellationReason(QueryGroup querygroup, QueryGroupTask task, ResourceType resourceType) { - Double thresholdInPercent = getThresholdInPercent(querygroup, resourceType); - return "[Workload Management] Cancelling Task ID : " - + task.getId() - + " from QueryGroup ID : " - + querygroup.get_id() - + " breached the resource limit of : " - + thresholdInPercent - + " for resource type : " + private String generateReasonString(QueryGroup queryGroup, ResourceType resourceType) { + final double currentUsage = getCurrentUsage(queryGroup, resourceType); + return "QueryGroup ID : " + + queryGroup.get_id() + + " breached the resource 
limit: (" + + currentUsage + + " > " + + queryGroup.getResourceLimits().get(resourceType) + + ") for resource type : " + resourceType.getName(); } - private Double getThresholdInPercent(QueryGroup querygroup, ResourceType resourceType) { - return querygroup.getResourceLimits().get(resourceType) * 100; + private List getTasksFor(QueryGroup queryGroup) { + return queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getActiveTasks(); } - private TaskCancellation createTaskCancellation(CancellableTask task, String cancellationReason) { - return new TaskCancellation(task, List.of(new TaskCancellation.Reason(cancellationReason, 5)), List.of(this::callbackOnCancel)); + private void cancelTasks(ResiliencyMode resiliencyMode) { + cancelTasks(getAllCancellableTasks(resiliencyMode)); } - List getTaskCancellationsForDeletedQueryGroup(QueryGroup queryGroup) { - List tasks = queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getActiveTasks(); + private void cancelTasks(List cancellableTasks) { + cancellableTasks.forEach(TaskCancellation::cancel); + } - List taskCancellations = new ArrayList<>(); - for (QueryGroupTask task : tasks) { - String cancellationReason = "[Workload Management] Cancelling Task ID : " - + task.getId() - + " from QueryGroup ID : " - + queryGroup.get_id(); - taskCancellations.add(createTaskCancellation(task, cancellationReason)); - } - return taskCancellations; + private TaskCancellation createTaskCancellation(CancellableTask task, List reasons) { + return new TaskCancellation(task, reasons, List.of(this::callbackOnCancel)); } private double getExcessUsage(QueryGroup queryGroup, ResourceType resourceType) { @@ -225,10 +181,12 @@ private double getExcessUsage(QueryGroup queryGroup, ResourceType resourceType) || !queryGroupLevelResourceUsageViews.containsKey(queryGroup.get_id())) { return 0; } + return getCurrentUsage(queryGroup, resourceType) - getNormalisedThreshold(queryGroup, resourceType); + } + private double getCurrentUsage(QueryGroup 
queryGroup, ResourceType resourceType) { final QueryGroupLevelResourceUsageView queryGroupResourceUsageView = queryGroupLevelResourceUsageViews.get(queryGroup.get_id()); - final double currentUsage = queryGroupResourceUsageView.getResourceUsageData().get(resourceType); - return currentUsage - getNormalisedThreshold(queryGroup, resourceType); + return queryGroupResourceUsageView.getResourceUsageData().get(resourceType); } /** diff --git a/server/src/main/java/org/opensearch/wlm/tracker/CpuUsageCalculator.java b/server/src/main/java/org/opensearch/wlm/tracker/CpuUsageCalculator.java index 772e698c324b3..05c84cd767b1f 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/CpuUsageCalculator.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/CpuUsageCalculator.java @@ -12,7 +12,6 @@ import org.opensearch.wlm.QueryGroupTask; import java.util.List; -import java.util.function.LongSupplier; /** * class to help make cpu usage calculations for the query group @@ -21,17 +20,11 @@ public class CpuUsageCalculator extends ResourceUsageCalculator { // This value should be initialised at the start time of the process and be used throughout the codebase public static final int PROCESSOR_COUNT = Runtime.getRuntime().availableProcessors(); public static final CpuUsageCalculator INSTANCE = new CpuUsageCalculator(); - private LongSupplier nanoTimeSupplier; private CpuUsageCalculator() {} - public void setNanoTimeSupplier(LongSupplier nanoTimeSupplier) { - this.nanoTimeSupplier = nanoTimeSupplier; - } - @Override public double calculateResourceUsage(List tasks) { - assert nanoTimeSupplier != null : "nanoTimeSupplier has to be set in order to calculate the resource usage"; double usage = tasks.stream().mapToDouble(this::calculateTaskResourceUsage).sum(); usage /= PROCESSOR_COUNT; @@ -40,6 +33,6 @@ public double calculateResourceUsage(List tasks) { @Override public double calculateTaskResourceUsage(QueryGroupTask task) { - return (1.0f * 
task.getTotalResourceUtilization(ResourceStats.CPU)) / (nanoTimeSupplier.getAsLong() - task.getStartTimeNanos()); + return (1.0f * task.getTotalResourceUtilization(ResourceStats.CPU)) / task.getElapsedTime(); } } diff --git a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java index ddf65e61a1275..b23d9ff342139 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java @@ -18,7 +18,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.function.LongSupplier; import java.util.stream.Collectors; /** @@ -33,9 +32,8 @@ public class QueryGroupResourceUsageTrackerService { * * @param taskResourceTrackingService Service that helps track resource usage of tasks running on a node. */ - public QueryGroupResourceUsageTrackerService(TaskResourceTrackingService taskResourceTrackingService, LongSupplier nanoTimeSupplier) { + public QueryGroupResourceUsageTrackerService(TaskResourceTrackingService taskResourceTrackingService) { this.taskResourceTrackingService = taskResourceTrackingService; - ResourceType.CPU.getResourceUsageCalculator().setNanoTimeSupplier(nanoTimeSupplier); } /** diff --git a/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculator.java b/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculator.java index e68693b9a6433..bc8317cbfbf92 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculator.java +++ b/server/src/main/java/org/opensearch/wlm/tracker/ResourceUsageCalculator.java @@ -12,7 +12,6 @@ import org.opensearch.wlm.QueryGroupTask; import java.util.List; -import java.util.function.LongSupplier; /** * This class is used to track query group level resource usage @@ -32,12 +31,4 @@ public abstract 
class ResourceUsageCalculator { * @return task level resource usage */ public abstract double calculateTaskResourceUsage(QueryGroupTask task); - - /** - * Since only few implementations might need this - * @param nanoTimeSupplier - */ - public void setNanoTimeSupplier(LongSupplier nanoTimeSupplier) { - - } } diff --git a/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java b/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java index 77fc6ac6e535b..0c7eb721806d5 100644 --- a/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java +++ b/server/src/test/java/org/opensearch/wlm/QueryGroupLevelResourceUsageViewTests.java @@ -14,7 +14,7 @@ import java.util.List; import java.util.Map; -import static org.opensearch.wlm.cancellation.TaskCancellationService.MIN_VALUE; +import static org.opensearch.wlm.cancellation.QueryGroupTaskCancellationService.MIN_VALUE; import static org.opensearch.wlm.tracker.CpuUsageCalculator.PROCESSOR_COUNT; import static org.opensearch.wlm.tracker.MemoryUsageCalculator.HEAP_SIZE_BYTES; import static org.opensearch.wlm.tracker.ResourceUsageCalculatorTests.createMockTaskWithResourceStats; diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/MaximumResourceTaskSelectionStrategyTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/MaximumResourceTaskSelectionStrategyTests.java index 05d080d51aa65..dc79822c59c49 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/MaximumResourceTaskSelectionStrategyTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/MaximumResourceTaskSelectionStrategyTests.java @@ -23,7 +23,7 @@ import java.util.List; import java.util.stream.IntStream; -import static org.opensearch.wlm.cancellation.TaskCancellationService.MIN_VALUE; +import static org.opensearch.wlm.cancellation.QueryGroupTaskCancellationService.MIN_VALUE; import static 
org.opensearch.wlm.tracker.MemoryUsageCalculator.HEAP_SIZE_BYTES; public class MaximumResourceTaskSelectionStrategyTests extends OpenSearchTestCase { diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellationServiceTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java similarity index 91% rename from server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellationServiceTests.java rename to server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java index 07764b93a41ea..f7a49235efc69 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/TaskCancellationServiceTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java @@ -9,7 +9,6 @@ package org.opensearch.wlm.cancellation; import org.opensearch.action.search.SearchAction; -import org.opensearch.action.search.SearchTask; import org.opensearch.cluster.metadata.QueryGroup; import org.opensearch.core.tasks.TaskId; import org.opensearch.tasks.TaskCancellation; @@ -36,7 +35,7 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; -public class TaskCancellationServiceTests extends OpenSearchTestCase { +public class QueryGroupTaskCancellationServiceTests extends OpenSearchTestCase { private static final String queryGroupId1 = "queryGroup1"; private static final String queryGroupId2 = "queryGroup2"; @@ -45,7 +44,7 @@ public class TaskCancellationServiceTests extends OpenSearchTestCase { private Map queryGroupLevelViews; private Set activeQueryGroups; private Set deletedQueryGroups; - private TaskCancellationService taskCancellation; + private QueryGroupTaskCancellationService taskCancellation; private WorkloadManagementSettings workloadManagementSettings; private QueryGroupResourceUsageTrackerService resourceUsageTrackerService; @@ -57,17 +56,15 @@ public void setup() { deletedQueryGroups = new HashSet<>(); 
clock = new TestClock(); - ResourceType.CPU.getResourceUsageCalculator().setNanoTimeSupplier(clock::getTime); when(workloadManagementSettings.getNodeLevelCpuCancellationThreshold()).thenReturn(0.9); when(workloadManagementSettings.getNodeLevelMemoryCancellationThreshold()).thenReturn(0.9); resourceUsageTrackerService = mock(QueryGroupResourceUsageTrackerService.class); - taskCancellation = new TaskCancellationService( + taskCancellation = new QueryGroupTaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), resourceUsageTrackerService, activeQueryGroups, - deletedQueryGroups, - () -> false + deletedQueryGroups ); } @@ -90,18 +87,11 @@ public void testGetCancellableTasksFrom_setupAppropriateCancellationReasonAndSco queryGroupLevelViews.put(queryGroupId1, mockView); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; - List cancellableTasksFrom = taskCancellation.getCancellableTasksFrom(queryGroup1); + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(List.of(queryGroup1)); assertEquals(2, cancellableTasksFrom.size()); assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); - assertEquals( - "[Workload Management] Cancelling Task ID : " - + cancellableTasksFrom.get(0).getTask().getId() - + " from QueryGroup ID : queryGroup1" - + " breached the resource limit of : 10.0 for resource type : cpu", - cancellableTasksFrom.get(0).getReasonString() - ); - assertEquals(5, cancellableTasksFrom.get(0).getReasons().get(0).getCancellationScore()); + assertEquals(1, cancellableTasksFrom.get(0).getReasons().get(0).getCancellationScore()); } public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThreshold() { @@ -122,7 +112,7 @@ public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThreshold() { queryGroupLevelViews.put(queryGroupId1, mockView); taskCancellation.queryGroupLevelResourceUsageViews = 
queryGroupLevelViews; - List cancellableTasksFrom = taskCancellation.getCancellableTasksFrom(queryGroup1); + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(List.of(queryGroup1)); assertEquals(2, cancellableTasksFrom.size()); assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); @@ -172,7 +162,7 @@ public void testGetCancellableTasksFrom_returnsNoTasksWhenNotBreachingThreshold( activeQueryGroups.add(queryGroup1); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; - List cancellableTasksFrom = taskCancellation.getCancellableTasksFrom(queryGroup1); + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(List.of(queryGroup1)); assertTrue(cancellableTasksFrom.isEmpty()); } @@ -193,13 +183,12 @@ public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { activeQueryGroups.add(queryGroup1); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; - TaskCancellationService taskCancellation = new TaskCancellationService( + QueryGroupTaskCancellationService taskCancellation = new QueryGroupTaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), resourceUsageTrackerService, activeQueryGroups, - deletedQueryGroups, - () -> false + deletedQueryGroups ); List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.SOFT); @@ -209,14 +198,14 @@ public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { public void testCancelTasks_cancelsGivenTasks() { ResourceType resourceType = ResourceType.CPU; double cpuUsage = 0.011; - double memoryUsage = 0.0; + double memoryUsage = 0.011; Double threshold = 0.01; QueryGroup queryGroup1 = new QueryGroup( "testQueryGroup", queryGroupId1, - new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, threshold)), + new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(resourceType, 
threshold, ResourceType.MEMORY, threshold)), 1L ); @@ -226,13 +215,12 @@ public void testCancelTasks_cancelsGivenTasks() { queryGroupLevelViews.put(queryGroupId1, mockView); activeQueryGroups.add(queryGroup1); - TaskCancellationService taskCancellation = new TaskCancellationService( + QueryGroupTaskCancellationService taskCancellation = new QueryGroupTaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), resourceUsageTrackerService, activeQueryGroups, - deletedQueryGroups, - () -> false + deletedQueryGroups ); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; @@ -243,7 +231,7 @@ public void testCancelTasks_cancelsGivenTasks() { assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); when(resourceUsageTrackerService.constructQueryGroupLevelUsageViews()).thenReturn(queryGroupLevelViews); - taskCancellation.cancelTasks(); + taskCancellation.cancelTasks(() -> false); assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); assertTrue(cancellableTasksFrom.get(1).getTask().isCancelled()); } @@ -289,13 +277,12 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { activeQueryGroups.add(activeQueryGroup); deletedQueryGroups.add(deletedQueryGroup); - TaskCancellationService taskCancellation = new TaskCancellationService( + QueryGroupTaskCancellationService taskCancellation = new QueryGroupTaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), resourceUsageTrackerService, activeQueryGroups, - deletedQueryGroups, - () -> true + deletedQueryGroups ); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; @@ -305,15 +292,13 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); - List cancellableTasksFromDeletedQueryGroups = 
taskCancellation.getTaskCancellationsForDeletedQueryGroup( - deletedQueryGroup - ); + List cancellableTasksFromDeletedQueryGroups = taskCancellation.getAllCancellableTasks(List.of(deletedQueryGroup)); assertEquals(2, cancellableTasksFromDeletedQueryGroups.size()); assertEquals(1000, cancellableTasksFromDeletedQueryGroups.get(0).getTask().getId()); assertEquals(1001, cancellableTasksFromDeletedQueryGroups.get(1).getTask().getId()); when(resourceUsageTrackerService.constructQueryGroupLevelUsageViews()).thenReturn(queryGroupLevelViews); - taskCancellation.cancelTasks(); + taskCancellation.cancelTasks(() -> true); assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); assertTrue(cancellableTasksFrom.get(1).getTask().isCancelled()); @@ -363,13 +348,12 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN activeQueryGroups.add(activeQueryGroup); deletedQueryGroups.add(deletedQueryGroup); - TaskCancellationService taskCancellation = new TaskCancellationService( + QueryGroupTaskCancellationService taskCancellation = new QueryGroupTaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), resourceUsageTrackerService, activeQueryGroups, - deletedQueryGroups, - () -> false + deletedQueryGroups ); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; @@ -378,15 +362,13 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); - List cancellableTasksFromDeletedQueryGroups = taskCancellation.getTaskCancellationsForDeletedQueryGroup( - deletedQueryGroup - ); + List cancellableTasksFromDeletedQueryGroups = taskCancellation.getAllCancellableTasks(List.of(deletedQueryGroup)); assertEquals(2, cancellableTasksFromDeletedQueryGroups.size()); assertEquals(1000, cancellableTasksFromDeletedQueryGroups.get(0).getTask().getId()); 
assertEquals(1001, cancellableTasksFromDeletedQueryGroups.get(1).getTask().getId()); when(resourceUsageTrackerService.constructQueryGroupLevelUsageViews()).thenReturn(queryGroupLevelViews); - taskCancellation.cancelTasks(); + taskCancellation.cancelTasks(() -> false); assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); assertTrue(cancellableTasksFrom.get(1).getTask().isCancelled()); @@ -425,13 +407,12 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { queryGroupLevelViews.put(queryGroupId2, mockView); Collections.addAll(activeQueryGroups, queryGroup1, queryGroup2); - TaskCancellationService taskCancellation = new TaskCancellationService( + QueryGroupTaskCancellationService taskCancellation = new QueryGroupTaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), resourceUsageTrackerService, activeQueryGroups, - deletedQueryGroups, - () -> true + deletedQueryGroups ); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; @@ -447,7 +428,7 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { assertEquals(8765, cancellableTasksFrom1.get(1).getTask().getId()); when(resourceUsageTrackerService.constructQueryGroupLevelUsageViews()).thenReturn(queryGroupLevelViews); - taskCancellation.cancelTasks(); + taskCancellation.cancelTasks(() -> true); assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); assertTrue(cancellableTasksFrom.get(1).getTask().isCancelled()); assertTrue(cancellableTasksFrom1.get(0).getTask().isCancelled()); @@ -528,7 +509,7 @@ public void testGetCancellableTasksFrom_doesNotReturnTasksWhenQueryGroupIdNotFou activeQueryGroups.add(queryGroup2); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; - List cancellableTasksFrom = taskCancellation.getCancellableTasksFrom(queryGroup2); + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(List.of(queryGroup2)); assertEquals(0, cancellableTasksFrom.size()); } 
@@ -546,13 +527,15 @@ private QueryGroupLevelResourceUsageView createResourceUsageViewMock(ResourceTyp } private QueryGroupTask getRandomSearchTask(long id) { - return new SearchTask( + return new QueryGroupTask( id, "transport", SearchAction.NAME, - () -> "test description", + "test description", new TaskId(randomLong() + ":" + randomLong()), - Collections.emptyMap() + Collections.emptyMap(), + null, + clock::getTime ); } } diff --git a/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java index 044239e2a1ecd..21d9717a1aaca 100644 --- a/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java +++ b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTests.java @@ -8,19 +8,15 @@ package org.opensearch.wlm.tracker; -import org.opensearch.cluster.metadata.QueryGroup; import org.opensearch.core.tasks.resourcetracker.ResourceStats; import org.opensearch.test.OpenSearchTestCase; -import org.opensearch.wlm.MutableQueryGroupFragment; -import org.opensearch.wlm.MutableQueryGroupFragment.ResiliencyMode; import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; import org.opensearch.wlm.tracker.ResourceUsageCalculatorTrackerServiceTests.TestClock; import java.util.List; -import java.util.Map; -import static org.opensearch.wlm.cancellation.TaskCancellationService.MIN_VALUE; +import static org.opensearch.wlm.cancellation.QueryGroupTaskCancellationService.MIN_VALUE; import static org.opensearch.wlm.tracker.CpuUsageCalculator.PROCESSOR_COUNT; import static org.opensearch.wlm.tracker.MemoryUsageCalculator.HEAP_SIZE_BYTES; import static org.mockito.Mockito.mock; @@ -32,14 +28,11 @@ public void testQueryGroupCpuUsage() { TestClock clock = new TestClock(); long fastForwardTime = PROCESSOR_COUNT * 200L; clock.fastForwardBy(fastForwardTime); - QueryGroup queryGroup = new QueryGroup( - "testQG", - new 
MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(ResourceType.CPU, 0.5 / PROCESSOR_COUNT)) - ); + double expectedQueryGroupCpuUsage = 1.0 / PROCESSOR_COUNT; QueryGroupTask mockTask = createMockTaskWithResourceStats(QueryGroupTask.class, fastForwardTime, 200, 0, 123); - ResourceType.CPU.getResourceUsageCalculator().setNanoTimeSupplier(clock::getTime); + when(mockTask.getElapsedTime()).thenReturn(fastForwardTime); double actualUsage = ResourceType.CPU.getResourceUsageCalculator().calculateResourceUsage(List.of(mockTask)); assertEquals(expectedQueryGroupCpuUsage, actualUsage, MIN_VALUE); diff --git a/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTrackerServiceTests.java b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTrackerServiceTests.java index 63913f5a8f67e..fe72bd6e710c8 100644 --- a/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTrackerServiceTests.java +++ b/server/src/test/java/org/opensearch/wlm/tracker/ResourceUsageCalculatorTrackerServiceTests.java @@ -9,10 +9,8 @@ package org.opensearch.wlm.tracker; import org.opensearch.action.search.SearchShardTask; -import org.opensearch.action.search.SearchTask; import org.opensearch.common.util.concurrent.ThreadContext; import org.opensearch.core.tasks.resourcetracker.ResourceStats; -import org.opensearch.tasks.CancellableTask; import org.opensearch.tasks.Task; import org.opensearch.tasks.TaskResourceTrackingService; import org.opensearch.test.OpenSearchTestCase; @@ -32,7 +30,7 @@ import java.util.concurrent.atomic.AtomicBoolean; import static org.opensearch.wlm.QueryGroupTask.QUERY_GROUP_ID_HEADER; -import static org.opensearch.wlm.cancellation.TaskCancellationService.MIN_VALUE; +import static org.opensearch.wlm.cancellation.QueryGroupTaskCancellationService.MIN_VALUE; import static org.opensearch.wlm.tracker.CpuUsageCalculator.PROCESSOR_COUNT; import static org.opensearch.wlm.tracker.MemoryUsageCalculator.HEAP_SIZE_BYTES; import 
static org.mockito.ArgumentMatchers.anyString; @@ -66,7 +64,7 @@ public void setup() { settings = mock(WorkloadManagementSettings.class); threadPool = new TestThreadPool(getTestName()); mockTaskResourceTrackingService = mock(TaskResourceTrackingService.class); - queryGroupResourceUsageTrackerService = new QueryGroupResourceUsageTrackerService(mockTaskResourceTrackingService, clock::getTime); + queryGroupResourceUsageTrackerService = new QueryGroupResourceUsageTrackerService(mockTaskResourceTrackingService); } @After @@ -76,10 +74,10 @@ public void cleanup() { public void testConstructQueryGroupLevelViews_CreatesQueryGroupLevelUsageView_WhenTasksArePresent() { List queryGroupIds = List.of("queryGroup1", "queryGroup2", "queryGroup3"); + clock.fastForwardBy(2000); Map activeSearchShardTasks = createActiveSearchShardTasks(queryGroupIds); when(mockTaskResourceTrackingService.getResourceAwareTasks()).thenReturn(activeSearchShardTasks); - clock.fastForwardBy(2000); Map stringQueryGroupLevelResourceUsageViewMap = queryGroupResourceUsageTrackerService .constructQueryGroupLevelUsageViews(); @@ -107,10 +105,10 @@ public void testConstructQueryGroupLevelViews_CreatesQueryGroupLevelUsageView_Wh public void testConstructQueryGroupLevelUsageViews_WithTasksHavingDifferentResourceUsage() { Map activeSearchShardTasks = new HashMap<>(); + clock.fastForwardBy(2000); activeSearchShardTasks.put(1L, createMockTask(SearchShardTask.class, 100, 200, "queryGroup1")); activeSearchShardTasks.put(2L, createMockTask(SearchShardTask.class, 200, 400, "queryGroup1")); when(mockTaskResourceTrackingService.getResourceAwareTasks()).thenReturn(activeSearchShardTasks); - clock.fastForwardBy(2000); Map queryGroupViews = queryGroupResourceUsageTrackerService .constructQueryGroupLevelUsageViews(); @@ -138,19 +136,16 @@ private Map createActiveSearchShardTasks(List queryGroupIds) return activeSearchShardTasks; } - private T createMockTask(Class type, long cpuUsage, long heapUsage, String queryGroupId) { + 
private T createMockTask(Class type, long cpuUsage, long heapUsage, String queryGroupId) { T task = mock(type); - if (task instanceof SearchTask || task instanceof SearchShardTask) { - // Stash the current thread context to ensure that any existing context is preserved and restored after setting the query group - // ID. - try (ThreadContext.StoredContext ignore = threadPool.getThreadContext().stashContext()) { - threadPool.getThreadContext().putHeader(QUERY_GROUP_ID_HEADER, queryGroupId); - ((QueryGroupTask) task).setQueryGroupId(threadPool.getThreadContext()); - } + try (ThreadContext.StoredContext ignore = threadPool.getThreadContext().stashContext()) { + threadPool.getThreadContext().putHeader(QUERY_GROUP_ID_HEADER, queryGroupId); + task.setQueryGroupId(threadPool.getThreadContext()); } when(task.getTotalResourceUtilization(ResourceStats.CPU)).thenReturn(cpuUsage); when(task.getTotalResourceUtilization(ResourceStats.MEMORY)).thenReturn(heapUsage); when(task.getStartTimeNanos()).thenReturn((long) 0); + when(task.getElapsedTime()).thenReturn(clock.getTime()); AtomicBoolean isCancelled = new AtomicBoolean(false); doAnswer(invocation -> { From 6381e17cc1c6e66967d4d6d63f92899528014e18 Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Tue, 10 Sep 2024 23:28:10 -0700 Subject: [PATCH 32/47] add chanegs Signed-off-by: Kaushal Kumar --- .../org/opensearch/wlm/QueryGroupService.java | 36 +++++++++---------- .../wlm/WorkloadManagementSettings.java | 32 +++++++++++++++++ 2 files changed, 49 insertions(+), 19 deletions(-) diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java index ded84adde72be..a4b385a66010d 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java @@ -47,9 +47,9 @@ public class QueryGroupService extends AbstractLifecycleComponent implements Clu private final ThreadPool threadPool; 
private final ClusterService clusterService; private final WorkloadManagementSettings workloadManagementSettings; - private Set activeQueryGroups = new HashSet<>(); - private Set deletedQueryGroups = new HashSet<>(); - private NodeDuressTrackers nodeDuressTrackers; + private final Set activeQueryGroups; + private final Set deletedQueryGroups; + private final NodeDuressTrackers nodeDuressTrackers; public QueryGroupService( TaskCancellationService taskCancellationService, @@ -60,11 +60,16 @@ public QueryGroupService( this(taskCancellationService, clusterService, threadPool, workloadManagementSettings, new NodeDuressTrackers( Map.of(ResourceType.CPU, new NodeDuressTracker(() -> - workloadManagementSettings.getNodeLevelCpuCancellationThreshold() < ProcessProbe.getInstance().getProcessCpuPercent() / 100.0, () -> 3), + workloadManagementSettings.getNodeLevelCpuCancellationThreshold() < ProcessProbe.getInstance().getProcessCpuPercent() / 100.0, + workloadManagementSettings::getDuressStreak), ResourceType.MEMORY, new NodeDuressTracker( - () -> workloadManagementSettings.getNodeLevelMemoryCancellationThreshold() <= JvmStats.jvmStats().getMem().getHeapUsedPercent() / 100.0, () -> 3)) + () -> workloadManagementSettings.getNodeLevelMemoryCancellationThreshold() <= JvmStats.jvmStats().getMem().getHeapUsedPercent() / 100.0, + workloadManagementSettings::getDuressStreak)) ), - new HashMap<>()); + new HashMap<>(), + new HashSet<>(clusterService.state().metadata().queryGroups().values()), + new HashSet<>() + ); } public QueryGroupService( @@ -73,18 +78,19 @@ public QueryGroupService( ThreadPool threadPool, WorkloadManagementSettings workloadManagementSettings, NodeDuressTrackers nodeDuressTrackers, - Map queryGroupStateMap + Map queryGroupStateMap, + Set activeQueryGroups, + Set deletedQueryGroups ) { this.taskCancellationService = taskCancellationService; this.clusterService = clusterService; this.threadPool = threadPool; this.workloadManagementSettings = 
workloadManagementSettings; this.nodeDuressTrackers = nodeDuressTrackers; - this.activeQueryGroups = getActiveQueryGroupsFromClusterState(); + this.activeQueryGroups = activeQueryGroups; + this.deletedQueryGroups = deletedQueryGroups; - // this logic here is to ensure the proper initialisation of queryGroupState for query groups from persisted metadata this.queryGroupStateMap = queryGroupStateMap; - this.activeQueryGroups.forEach(queryGroup -> queryGroupStateMap.put(queryGroup.get_id(), new QueryGroupState())); } /** @@ -95,7 +101,7 @@ protected void doRun() { return; } // taskCancellationService.cancelTasks(activeQueryGroups, deletedQueryGroups); - taskCancellationService.cancelTasks(() -> nodeDuressTrackers.isNodeInDuress()); + taskCancellationService.cancelTasks(nodeDuressTrackers::isNodeInDuress); } /** @@ -202,12 +208,4 @@ public void rejectIfNeeded(String queryGroupId) { throw new OpenSearchRejectedExecutionException("QueryGroup " + queryGroupId + " is already contended." + reason.toString()); } } - - protected Set getDeletedQueryGroups() { - return deletedQueryGroups; - } - - protected Set getActiveQueryGroups() { - return activeQueryGroups; - } } diff --git a/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java b/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java index 8a52a2b151c35..317e8defd8a4f 100644 --- a/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java +++ b/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java @@ -35,6 +35,19 @@ public class WorkloadManagementSettings { private Double nodeLevelCpuCancellationThreshold; private Double nodeLevelCpuRejectionThreshold; + + /** + * Setting name for QueryGroupService node duress streak + */ + public static final String QUERYGROUP_SERVICE_DURESS_STREAK_SETTING_NAME = "wlm.query_group.service.duress_streak"; + private int duressStreak; + public static final Setting QUERYGROUP_SERVICE_DURESS_STREAK_SETTING = Setting.intSetting( + 
QUERYGROUP_SERVICE_DURESS_STREAK_SETTING_NAME, + 3, + 3, + Setting.Property.NodeScope + ); + /** * Setting name for Query Group Service run interval */ @@ -134,6 +147,7 @@ public WorkloadManagementSettings(Settings settings, ClusterSettings clusterSett nodeLevelCpuCancellationThreshold = NODE_LEVEL_CPU_CANCELLATION_THRESHOLD.get(settings); nodeLevelCpuRejectionThreshold = NODE_LEVEL_CPU_REJECTION_THRESHOLD.get(settings); this.queryGroupServiceRunInterval = TimeValue.timeValueMillis(QUERYGROUP_SERVICE_RUN_INTERVAL_SETTING.get(settings)); + duressStreak = QUERYGROUP_SERVICE_DURESS_STREAK_SETTING.get(settings); ensureRejectionThresholdIsLessThanCancellation( nodeLevelMemoryRejectionThreshold, @@ -154,6 +168,24 @@ public WorkloadManagementSettings(Settings settings, ClusterSettings clusterSett clusterSettings.addSettingsUpdateConsumer(NODE_LEVEL_CPU_REJECTION_THRESHOLD, this::setNodeLevelCpuRejectionThreshold); clusterSettings.addSettingsUpdateConsumer(WLM_MODE_SETTING, this::setWlmMode); clusterSettings.addSettingsUpdateConsumer(QUERYGROUP_SERVICE_RUN_INTERVAL_SETTING, this::setQueryGroupServiceRunInterval); + clusterSettings.addSettingsUpdateConsumer(QUERYGROUP_SERVICE_DURESS_STREAK_SETTING, this::setDuressStreak); + } + + + /** + * node duress streak getter + * @return current duressStreak value + */ + public int getDuressStreak() { + return duressStreak; + } + + /** + * node duress streak setter + * @param duressStreak new value + */ + public void setDuressStreak(int duressStreak) { + this.duressStreak = duressStreak; } /** From b78ca02c72ca345b4a378e3f818e9cfd1225b732 Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Tue, 10 Sep 2024 23:32:41 -0700 Subject: [PATCH 33/47] address comments Signed-off-by: Kaushal Kumar --- server/src/main/java/org/opensearch/wlm/QueryGroupTask.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java 
b/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java index 842a320a65075..a1cb766579d43 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java @@ -32,7 +32,7 @@ public class QueryGroupTask extends CancellableTask { private static final Logger logger = LogManager.getLogger(QueryGroupTask.class); public static final String QUERY_GROUP_ID_HEADER = "queryGroupId"; public static final Supplier DEFAULT_QUERY_GROUP_ID_SUPPLIER = () -> "DEFAULT_QUERY_GROUP"; - private LongSupplier nanoTimeSupplier; + private final LongSupplier nanoTimeSupplier; private String queryGroupId; public QueryGroupTask(long id, String type, String action, String description, TaskId parentTaskId, Map headers) { @@ -48,7 +48,7 @@ public QueryGroupTask( Map headers, TimeValue cancelAfterTimeInterval ) { - super(id, type, action, description, parentTaskId, headers, cancelAfterTimeInterval); + this(id, type, action, description, parentTaskId, headers, cancelAfterTimeInterval, System::nanoTime); } public QueryGroupTask( From bec1eced83bea61a5d662830d10d600e41c927c9 Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Wed, 11 Sep 2024 12:43:01 -0700 Subject: [PATCH 34/47] temp changes Signed-off-by: Kaushal Kumar --- .../main/java/org/opensearch/node/Node.java | 6 ++-- .../org/opensearch/wlm/QueryGroupService.java | 8 +++--- .../QueryGroupTaskCancellationService.java | 13 ++++++--- ...adManagementTransportInterceptorTests.java | 6 ++-- ...eryGroupRequestOperationListenerTests.java | 28 ++++++++++++------- 5 files changed, 37 insertions(+), 24 deletions(-) diff --git a/server/src/main/java/org/opensearch/node/Node.java b/server/src/main/java/org/opensearch/node/Node.java index 39153573d24c4..f24cd51537858 100644 --- a/server/src/main/java/org/opensearch/node/Node.java +++ b/server/src/main/java/org/opensearch/node/Node.java @@ -271,6 +271,7 @@ import org.opensearch.watcher.ResourceWatcherService; import 
org.opensearch.wlm.QueryGroupService; import org.opensearch.wlm.cancellation.MaximumResourceTaskSelectionStrategy; +import org.opensearch.wlm.cancellation.QueryGroupTaskCancellationService; import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService; import org.opensearch.wlm.WorkloadManagementSettings; import org.opensearch.wlm.WorkloadManagementTransportInterceptor; @@ -1030,8 +1031,7 @@ protected Node( identityService.initializeIdentityAwarePlugins(identityAwarePlugins); QueryGroupResourceUsageTrackerService queryGroupResourceUsageTrackerService = new QueryGroupResourceUsageTrackerService( - taskResourceTrackingService, - System::nanoTime + taskResourceTrackingService ); WorkloadManagementSettings workloadManagementSettings = new WorkloadManagementSettings( settings, @@ -1039,7 +1039,7 @@ protected Node( ); final QueryGroupService queryGroupService = new QueryGroupService( - new org.opensearch.wlm.cancellation.TaskCancellationService(workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), queryGroupResourceUsageTrackerService), + new QueryGroupTaskCancellationService(workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), queryGroupResourceUsageTrackerService), clusterService, threadPool, workloadManagementSettings diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java index a4b385a66010d..8bcb6b372d07f 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java @@ -22,7 +22,7 @@ import org.opensearch.search.backpressure.trackers.NodeDuressTrackers.NodeDuressTracker; import org.opensearch.threadpool.Scheduler; import org.opensearch.threadpool.ThreadPool; -import org.opensearch.wlm.cancellation.TaskCancellationService; +import org.opensearch.wlm.cancellation.QueryGroupTaskCancellationService; import org.opensearch.wlm.stats.QueryGroupState; import 
org.opensearch.wlm.stats.QueryGroupStats; import org.opensearch.wlm.stats.QueryGroupStats.QueryGroupStatsHolder; @@ -42,7 +42,7 @@ public class QueryGroupService extends AbstractLifecycleComponent implements Clu private final Map queryGroupStateMap; private static final Logger logger = LogManager.getLogger(QueryGroupService.class); - private final TaskCancellationService taskCancellationService; + private final QueryGroupTaskCancellationService taskCancellationService; private volatile Scheduler.Cancellable scheduledFuture; private final ThreadPool threadPool; private final ClusterService clusterService; @@ -52,7 +52,7 @@ public class QueryGroupService extends AbstractLifecycleComponent implements Clu private final NodeDuressTrackers nodeDuressTrackers; public QueryGroupService( - TaskCancellationService taskCancellationService, + QueryGroupTaskCancellationService taskCancellationService, ClusterService clusterService, ThreadPool threadPool, WorkloadManagementSettings workloadManagementSettings) { @@ -73,7 +73,7 @@ ResourceType.MEMORY, new NodeDuressTracker( } public QueryGroupService( - TaskCancellationService taskCancellationService, + QueryGroupTaskCancellationService taskCancellationService, ClusterService clusterService, ThreadPool threadPool, WorkloadManagementSettings workloadManagementSettings, diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java b/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java index a2c97c8d8635b..06decb859b8cd 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java @@ -18,10 +18,7 @@ import org.opensearch.wlm.WorkloadManagementSettings; import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; -import 
java.util.Map; +import java.util.*; import java.util.function.BooleanSupplier; import java.util.function.Consumer; import java.util.stream.Collectors; @@ -56,6 +53,14 @@ public class QueryGroupTaskCancellationService { private final Collection activeQueryGroups; private final Collection deletedQueryGroups; + public QueryGroupTaskCancellationService( + WorkloadManagementSettings workloadManagementSettings, + TaskSelectionStrategy taskSelectionStrategy, + QueryGroupResourceUsageTrackerService resourceUsageTrackerService + ) { + this(workloadManagementSettings, taskSelectionStrategy, resourceUsageTrackerService, new HashSet<>(), new HashSet<>()); + } + public QueryGroupTaskCancellationService( WorkloadManagementSettings workloadManagementSettings, TaskSelectionStrategy taskSelectionStrategy, diff --git a/server/src/test/java/org/opensearch/wlm/WorkloadManagementTransportInterceptorTests.java b/server/src/test/java/org/opensearch/wlm/WorkloadManagementTransportInterceptorTests.java index 3a16c4e9a9691..4159ab9a17962 100644 --- a/server/src/test/java/org/opensearch/wlm/WorkloadManagementTransportInterceptorTests.java +++ b/server/src/test/java/org/opensearch/wlm/WorkloadManagementTransportInterceptorTests.java @@ -17,7 +17,7 @@ import org.opensearch.transport.TransportRequest; import org.opensearch.transport.TransportRequestHandler; import org.opensearch.wlm.WorkloadManagementTransportInterceptor.RequestHandler; -import org.opensearch.wlm.cancellation.TaskCancellationService; +import org.opensearch.wlm.cancellation.QueryGroupTaskCancellationService; import java.util.Collections; @@ -26,7 +26,7 @@ import static org.opensearch.threadpool.ThreadPool.Names.SAME; public class WorkloadManagementTransportInterceptorTests extends OpenSearchTestCase { - private TaskCancellationService mockTaskCancellationService; + private QueryGroupTaskCancellationService mockTaskCancellationService; private ClusterService mockClusterService; private ThreadPool mockThreadPool; private 
WorkloadManagementSettings mockWorkloadManagementSettings; @@ -35,7 +35,7 @@ public class WorkloadManagementTransportInterceptorTests extends OpenSearchTestC public void setUp() throws Exception { super.setUp(); - mockTaskCancellationService = mock(TaskCancellationService.class); + mockTaskCancellationService = mock(QueryGroupTaskCancellationService.class); mockClusterService = mock(ClusterService.class); mockThreadPool = mock(ThreadPool.class); mockWorkloadManagementSettings = mock(WorkloadManagementSettings.class); diff --git a/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java b/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java index 175ec88a6733d..74e6a707508ec 100644 --- a/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java +++ b/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java @@ -14,29 +14,33 @@ import org.opensearch.common.util.concurrent.ThreadContext; import org.opensearch.core.concurrency.OpenSearchRejectedExecutionException; import org.opensearch.test.OpenSearchTestCase; -import org.opensearch.threadpool.Scheduler; import org.opensearch.threadpool.TestThreadPool; import org.opensearch.threadpool.ThreadPool; import org.opensearch.wlm.QueryGroupService; import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; import org.opensearch.wlm.WorkloadManagementSettings; -import org.opensearch.wlm.cancellation.TaskCancellationService; +import org.opensearch.wlm.cancellation.QueryGroupTaskCancellationService; import org.opensearch.wlm.stats.QueryGroupState; import org.opensearch.wlm.stats.QueryGroupStats; -import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService; import java.io.IOException; -import java.util.*; - -import static org.mockito.Mockito.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import 
java.util.List; +import java.util.Map; + +import static org.mockito.Mockito.doNothing; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; public class QueryGroupRequestOperationListenerTests extends OpenSearchTestCase { public static final int ITERATIONS = 20; ThreadPool testThreadPool; QueryGroupService queryGroupService; - private TaskCancellationService taskCancellationService; + private QueryGroupTaskCancellationService taskCancellationService; private ClusterService mockClusterService; private WorkloadManagementSettings mockWorkloadManagementSettings; Map queryGroupStateMap; @@ -45,7 +49,7 @@ public class QueryGroupRequestOperationListenerTests extends OpenSearchTestCase public void setUp() throws Exception { super.setUp(); - taskCancellationService = mock(TaskCancellationService.class); + taskCancellationService = mock(QueryGroupTaskCancellationService.class); mockClusterService = mock(ClusterService.class); mockWorkloadManagementSettings = mock(WorkloadManagementSettings.class); queryGroupStateMap = new HashMap<>(); @@ -108,7 +112,9 @@ public void testMultiThreadedValidQueryGroupRequestFailures() { testThreadPool, mockWorkloadManagementSettings, null, - queryGroupStateMap + queryGroupStateMap, + null, + null ); sut = new QueryGroupRequestOperationListener(queryGroupService, testThreadPool); @@ -197,7 +203,9 @@ private void assertSuccess( testThreadPool, mockWorkloadManagementSettings, null, - queryGroupStateMap + queryGroupStateMap, + null, + null ); sut = new QueryGroupRequestOperationListener(queryGroupService, testThreadPool); From 1e76edeb361a214fd1bad58168133c10b1a959c6 Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Thu, 12 Sep 2024 18:14:33 -0700 Subject: [PATCH 35/47] add UTs Signed-off-by: Kaushal Kumar --- .../common/settings/ClusterSettings.java | 5 +- .../main/java/org/opensearch/node/Node.java | 12 +- .../org/opensearch/wlm/QueryGroupService.java | 102 ++++-- 
.../main/java/org/opensearch/wlm/WlmMode.java | 3 +- .../wlm/WorkloadManagementSettings.java | 4 +- .../QueryGroupTaskCancellationService.java | 63 +++- .../opensearch/wlm/stats/QueryGroupState.java | 12 +- ...QueryGroupResourceUsageTrackerService.java | 2 + .../wlm/QueryGroupServiceTests.java | 331 ++++++++++++++++++ ...adManagementTransportInterceptorTests.java | 12 +- ...ueryGroupTaskCancellationServiceTests.java | 7 +- ...eryGroupRequestOperationListenerTests.java | 47 ++- 12 files changed, 538 insertions(+), 62 deletions(-) create mode 100644 server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java diff --git a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java index 09832e2b41b6d..879798a6558d4 100644 --- a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java @@ -793,7 +793,10 @@ public void apply(Settings value, Settings current, Settings previous) { WorkloadManagementSettings.NODE_LEVEL_CPU_REJECTION_THRESHOLD, WorkloadManagementSettings.NODE_LEVEL_CPU_CANCELLATION_THRESHOLD, WorkloadManagementSettings.NODE_LEVEL_MEMORY_REJECTION_THRESHOLD, - WorkloadManagementSettings.NODE_LEVEL_MEMORY_CANCELLATION_THRESHOLD + WorkloadManagementSettings.NODE_LEVEL_MEMORY_CANCELLATION_THRESHOLD, + WorkloadManagementSettings.WLM_MODE_SETTING, + WorkloadManagementSettings.QUERYGROUP_SERVICE_RUN_INTERVAL_SETTING, + WorkloadManagementSettings.QUERYGROUP_SERVICE_DURESS_STREAK_SETTING ) ) ); diff --git a/server/src/main/java/org/opensearch/node/Node.java b/server/src/main/java/org/opensearch/node/Node.java index f24cd51537858..aff9a6a3429a4 100644 --- a/server/src/main/java/org/opensearch/node/Node.java +++ b/server/src/main/java/org/opensearch/node/Node.java @@ -270,12 +270,12 @@ import org.opensearch.usage.UsageService; import org.opensearch.watcher.ResourceWatcherService; 
import org.opensearch.wlm.QueryGroupService; -import org.opensearch.wlm.cancellation.MaximumResourceTaskSelectionStrategy; -import org.opensearch.wlm.cancellation.QueryGroupTaskCancellationService; -import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService; import org.opensearch.wlm.WorkloadManagementSettings; import org.opensearch.wlm.WorkloadManagementTransportInterceptor; +import org.opensearch.wlm.cancellation.MaximumResourceTaskSelectionStrategy; +import org.opensearch.wlm.cancellation.QueryGroupTaskCancellationService; import org.opensearch.wlm.listeners.QueryGroupRequestOperationListener; +import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService; import javax.net.ssl.SNIHostName; @@ -1039,7 +1039,11 @@ protected Node( ); final QueryGroupService queryGroupService = new QueryGroupService( - new QueryGroupTaskCancellationService(workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), queryGroupResourceUsageTrackerService), + new QueryGroupTaskCancellationService( + workloadManagementSettings, + new MaximumResourceTaskSelectionStrategy(), + queryGroupResourceUsageTrackerService + ), clusterService, threadPool, workloadManagementSettings diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java index 8bcb6b372d07f..2b99c3eda0242 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java @@ -7,6 +7,7 @@ */ package org.opensearch.wlm; + import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.opensearch.cluster.ClusterChangedEvent; @@ -33,6 +34,8 @@ import java.util.Map; import java.util.Set; +import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.TRACKED_RESOURCES; + /** * As of now this is a stub and main implementation PR will be raised soon.Coming PR will collate these changes with core 
QueryGroupService changes */ @@ -55,16 +58,30 @@ public QueryGroupService( QueryGroupTaskCancellationService taskCancellationService, ClusterService clusterService, ThreadPool threadPool, - WorkloadManagementSettings workloadManagementSettings) { + WorkloadManagementSettings workloadManagementSettings + ) { - this(taskCancellationService, clusterService, threadPool, workloadManagementSettings, + this( + taskCancellationService, + clusterService, + threadPool, + workloadManagementSettings, new NodeDuressTrackers( - Map.of(ResourceType.CPU, new NodeDuressTracker(() -> - workloadManagementSettings.getNodeLevelCpuCancellationThreshold() < ProcessProbe.getInstance().getProcessCpuPercent() / 100.0, - workloadManagementSettings::getDuressStreak), - ResourceType.MEMORY, new NodeDuressTracker( - () -> workloadManagementSettings.getNodeLevelMemoryCancellationThreshold() <= JvmStats.jvmStats().getMem().getHeapUsedPercent() / 100.0, - workloadManagementSettings::getDuressStreak)) + Map.of( + ResourceType.CPU, + new NodeDuressTracker( + () -> workloadManagementSettings.getNodeLevelCpuCancellationThreshold() < ProcessProbe.getInstance() + .getProcessCpuPercent() / 100.0, + workloadManagementSettings::getDuressStreak + ), + ResourceType.MEMORY, + new NodeDuressTracker( + () -> workloadManagementSettings.getNodeLevelMemoryCancellationThreshold() <= JvmStats.jvmStats() + .getMem() + .getHeapUsedPercent() / 100.0, + workloadManagementSettings::getDuressStreak + ) + ) ), new HashMap<>(), new HashSet<>(clusterService.state().metadata().queryGroups().values()), @@ -78,7 +95,7 @@ public QueryGroupService( ThreadPool threadPool, WorkloadManagementSettings workloadManagementSettings, NodeDuressTrackers nodeDuressTrackers, - Map queryGroupStateMap, + Map stateMap, Set activeQueryGroups, Set deletedQueryGroups ) { @@ -89,8 +106,10 @@ public QueryGroupService( this.nodeDuressTrackers = nodeDuressTrackers; this.activeQueryGroups = activeQueryGroups; this.deletedQueryGroups = 
deletedQueryGroups; - - this.queryGroupStateMap = queryGroupStateMap; + activeQueryGroups.forEach(queryGroup -> stateMap.putIfAbsent(queryGroup.get_id(), new QueryGroupState())); + this.queryGroupStateMap = stateMap; + this.queryGroupStateMap.put(QueryGroupTask.DEFAULT_QUERY_GROUP_ID_SUPPLIER.get(), new QueryGroupState()); + taskCancellationService.setQueryGroupStateMapAccessor(this::getQueryGroupState); } /** @@ -100,10 +119,14 @@ protected void doRun() { if (workloadManagementSettings.getWlmMode() == WlmMode.DISABLED) { return; } -// taskCancellationService.cancelTasks(activeQueryGroups, deletedQueryGroups); + taskCancellationService.refreshQueryGroups(activeQueryGroups, deletedQueryGroups); taskCancellationService.cancelTasks(nodeDuressTrackers::isNodeInDuress); } + private QueryGroupState getQueryGroupState(final String queryGroupId) { + return queryGroupStateMap.get(queryGroupId); + } + /** * {@link AbstractLifecycleComponent} lifecycle method */ @@ -128,11 +151,6 @@ protected void doStop() { @Override protected void doClose() throws IOException {} - protected Set getActiveQueryGroupsFromClusterState() { - Map queryGroups = clusterService.state().metadata().queryGroups(); - return new HashSet<>(queryGroups.values()); - } - @Override public void applyClusterState(ClusterChangedEvent event) { // Retrieve the current and previous cluster states @@ -150,6 +168,7 @@ public void applyClusterState(ClusterChangedEvent event) { QueryGroup newQueryGroup = currentQueryGroups.get(queryGroupName); // Perform any necessary actions with the new query group this.activeQueryGroups.add(newQueryGroup); + queryGroupStateMap.put(newQueryGroup.get_id(), new QueryGroupState()); } } @@ -160,9 +179,10 @@ public void applyClusterState(ClusterChangedEvent event) { QueryGroup deletedQueryGroup = previousQueryGroups.get(queryGroupName); // Perform any necessary actions with the deleted query group this.deletedQueryGroups.add(deletedQueryGroup); + 
queryGroupStateMap.remove(deletedQueryGroup.get_id()); } } - } // tested + } /** * updates the failure stats for the query group @@ -199,13 +219,51 @@ public QueryGroupStats nodeStats() { */ public void rejectIfNeeded(String queryGroupId) { if (queryGroupId == null) return; + QueryGroupState queryGroupState = queryGroupStateMap.get(queryGroupId); + + // This can happen if the request failed for a deleted query group + // or new queryGroup is being created and has not been acknowledged yet + if (queryGroupState == null) { + return; + } + boolean reject = false; final StringBuilder reason = new StringBuilder(); - // TODO: At this point this is dummy and we need to decide whether to cancel the request based on last - // reported resource usage for the queryGroup. We also need to increment the rejection count here for the - // query group + + // rejections will not happen for SOFT mode QueryGroups + QueryGroup queryGroup = activeQueryGroups.stream().filter(x -> x.get_id().equals(queryGroupId)).findFirst().get(); + + if (queryGroup.getResiliencyMode() == MutableQueryGroupFragment.ResiliencyMode.SOFT) return; + + for (ResourceType resourceType : TRACKED_RESOURCES) { + if (queryGroup.getResourceLimits().containsKey(resourceType)) { + final double threshold = queryGroup.getResourceLimits().get(resourceType); + final double lastRecordedUsage = queryGroupState.getResourceState().get(resourceType).getLastRecordedUsage(); + if (threshold < lastRecordedUsage) { + reject = true; + reason.append(resourceType) + .append(" limit is breaching for ENFORCED type QueryGroup: (") + .append(threshold) + .append(" < ") + .append(lastRecordedUsage) + .append("). "); + queryGroupState.getResourceState().get(resourceType).rejections.inc(); + // should not double count even if both the resource limits are breaching + break; + } + } + } if (reject) { - throw new OpenSearchRejectedExecutionException("QueryGroup " + queryGroupId + " is already contended." 
+ reason.toString()); + queryGroupState.totalRejections.inc(); + throw new OpenSearchRejectedExecutionException("QueryGroup " + queryGroupId + " is already contended. " + reason.toString()); } } + + public Set getActiveQueryGroups() { + return activeQueryGroups; + } + + public Set getDeletedQueryGroups() { + return deletedQueryGroups; + } } diff --git a/server/src/main/java/org/opensearch/wlm/WlmMode.java b/server/src/main/java/org/opensearch/wlm/WlmMode.java index 121686458692b..40407525cc24d 100644 --- a/server/src/main/java/org/opensearch/wlm/WlmMode.java +++ b/server/src/main/java/org/opensearch/wlm/WlmMode.java @@ -20,6 +20,7 @@ public enum WlmMode { DISABLED("disabled"); private final String name; + WlmMode(String name) { this.name = name; } @@ -29,7 +30,7 @@ public String getName() { } public static WlmMode fromName(String name) { - for (WlmMode wlmMode: values()) { + for (WlmMode wlmMode : values()) { if (wlmMode.getName().equals(name)) { return wlmMode; } diff --git a/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java b/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java index 317e8defd8a4f..f59619f43e118 100644 --- a/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java +++ b/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java @@ -35,7 +35,6 @@ public class WorkloadManagementSettings { private Double nodeLevelCpuCancellationThreshold; private Double nodeLevelCpuRejectionThreshold; - /** * Setting name for QueryGroupService node duress streak */ @@ -45,6 +44,7 @@ public class WorkloadManagementSettings { QUERYGROUP_SERVICE_DURESS_STREAK_SETTING_NAME, 3, 3, + Setting.Property.Dynamic, Setting.Property.NodeScope ); @@ -61,6 +61,7 @@ public class WorkloadManagementSettings { QUERYGROUP_SERVICE_RUN_INTERVAL_SETTING_NAME, DEFAULT_QUERYGROUP_SERVICE_RUN_INTERVAL_MILLIS, 1000, + Setting.Property.Dynamic, Setting.Property.NodeScope ); @@ -171,7 +172,6 @@ public 
WorkloadManagementSettings(Settings settings, ClusterSettings clusterSett clusterSettings.addSettingsUpdateConsumer(QUERYGROUP_SERVICE_DURESS_STREAK_SETTING, this::setDuressStreak); } - /** * node duress streak getter * @return current duressStreak value diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java b/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java index 06decb859b8cd..6237251044464 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java @@ -8,19 +8,26 @@ package org.opensearch.wlm.cancellation; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.opensearch.cluster.metadata.QueryGroup; -import org.opensearch.tasks.CancellableTask; import org.opensearch.tasks.TaskCancellation; import org.opensearch.wlm.MutableQueryGroupFragment.ResiliencyMode; import org.opensearch.wlm.QueryGroupLevelResourceUsageView; import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; import org.opensearch.wlm.WorkloadManagementSettings; +import org.opensearch.wlm.stats.QueryGroupState; import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService; -import java.util.*; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Map; import java.util.function.BooleanSupplier; import java.util.function.Consumer; +import java.util.function.Function; import java.util.stream.Collectors; import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.TRACKED_RESOURCES; @@ -44,14 +51,16 @@ */ public class QueryGroupTaskCancellationService { public static final double MIN_VALUE = 1e-9; + private static final Logger log = LogManager.getLogger(QueryGroupTaskCancellationService.class); private 
final WorkloadManagementSettings workloadManagementSettings; private final TaskSelectionStrategy taskSelectionStrategy; private final QueryGroupResourceUsageTrackerService resourceUsageTrackerService; // a map of QueryGroupId to its corresponding QueryGroupLevelResourceUsageView object Map queryGroupLevelResourceUsageViews; - private final Collection activeQueryGroups; - private final Collection deletedQueryGroups; + private Collection activeQueryGroups; + private Collection deletedQueryGroups; + private Function queryGroupStateAccessor; public QueryGroupTaskCancellationService( WorkloadManagementSettings workloadManagementSettings, @@ -75,15 +84,32 @@ public QueryGroupTaskCancellationService( this.deletedQueryGroups = deletedQueryGroups; } + public void setQueryGroupStateMapAccessor(final Function queryGroupStateAccessor) { + this.queryGroupStateAccessor = queryGroupStateAccessor; + } + /** * Cancel tasks based on the implemented strategy. */ - public final void cancelTasks(BooleanSupplier isNodeInDuress) { + public void cancelTasks(BooleanSupplier isNodeInDuress) { queryGroupLevelResourceUsageViews = resourceUsageTrackerService.constructQueryGroupLevelUsageViews(); // cancel tasks from QueryGroups that are in Enforced mode that are breaching their resource limits cancelTasks(ResiliencyMode.ENFORCED); // if the node is in duress, cancel tasks accordingly. 
handleNodeDuress(isNodeInDuress); + + updateResourceUsageInQueryGroupState(); + } + + private void updateResourceUsageInQueryGroupState() { + for (Map.Entry queryGroupLevelResourceUsageViewEntry : queryGroupLevelResourceUsageViews + .entrySet()) { + QueryGroupState queryGroupState = getQueryGroupState(queryGroupLevelResourceUsageViewEntry.getKey()); + TRACKED_RESOURCES.forEach(resourceType -> { + final double currentUsage = queryGroupLevelResourceUsageViewEntry.getValue().getResourceUsageData().get(resourceType); + queryGroupState.getResourceState().get(resourceType).setLastRecordedUsage(currentUsage); + }); + } } private void handleNodeDuress(BooleanSupplier isNodeInDuress) { @@ -123,6 +149,7 @@ List getAllCancellableTasks(ResiliencyMode resiliencyMode) { */ List getAllCancellableTasks(Collection queryGroups) { List taskCancellations = new ArrayList<>(); + final List onCancelCallbacks = new ArrayList<>(); for (QueryGroup queryGroup : queryGroups) { final List reasons = new ArrayList<>(); List selectedTasks = new ArrayList<>(); @@ -132,8 +159,9 @@ List getAllCancellableTasks(Collection queryGroups .calculateResourceUsage(selectedTasks); if (excessUsage > MIN_VALUE) { reasons.add(new TaskCancellation.Reason(generateReasonString(queryGroup, resourceType), 1)); - // TODO: We will need to add the cancellation callback for these resources for the queryGroup to reflect stats + // TODO: We will need to add the cancellation callback for these resources for the queryGroup to reflect stats + onCancelCallbacks.add(this.getResourceTypeOnCancelCallback(queryGroup.get_id(), resourceType)); // Only add tasks not already added to avoid double cancellations selectedTasks.addAll( taskSelectionStrategy.selectTasksForCancellation(getTasksFor(queryGroup), excessUsage, resourceType) @@ -145,8 +173,9 @@ List getAllCancellableTasks(Collection queryGroups } if (!reasons.isEmpty()) { + onCancelCallbacks.add(getQueryGroupState(queryGroup.get_id()).totalCancellations::inc); 
taskCancellations.addAll( - selectedTasks.stream().map(task -> createTaskCancellation(task, reasons)).collect(Collectors.toList()) + selectedTasks.stream().map(task -> new TaskCancellation(task, reasons, onCancelCallbacks)).collect(Collectors.toList()) ); } } @@ -177,10 +206,6 @@ private void cancelTasks(List cancellableTasks) { cancellableTasks.forEach(TaskCancellation::cancel); } - private TaskCancellation createTaskCancellation(CancellableTask task, List reasons) { - return new TaskCancellation(task, reasons, List.of(this::callbackOnCancel)); - } - private double getExcessUsage(QueryGroup queryGroup, ResourceType resourceType) { if (queryGroup.getResourceLimits().get(resourceType) == null || !queryGroupLevelResourceUsageViews.containsKey(queryGroup.get_id())) { @@ -204,7 +229,19 @@ private double getNormalisedThreshold(QueryGroup queryGroup, ResourceType resour return queryGroup.getResourceLimits().get(resourceType) * nodeLevelCancellationThreshold; } - private void callbackOnCancel() { - // TODO Implement callback logic here mostly used for Stats + private Runnable getResourceTypeOnCancelCallback(String queryGroupId, ResourceType resourceType) { + QueryGroupState queryGroupState = getQueryGroupState(queryGroupId); + return queryGroupState.getResourceState().get(resourceType).cancellations::inc; + } + + private QueryGroupState getQueryGroupState(String queryGroupId) { + assert queryGroupId != null : "queryGroupId should never be null at this point."; + + return queryGroupStateAccessor.apply(queryGroupId); + } + + public void refreshQueryGroups(Collection activeQueryGroups, Collection deletedQueryGroups) { + this.activeQueryGroups = activeQueryGroups; + this.deletedQueryGroups = deletedQueryGroups; } } diff --git a/server/src/main/java/org/opensearch/wlm/stats/QueryGroupState.java b/server/src/main/java/org/opensearch/wlm/stats/QueryGroupState.java index 376d34dd7c8ca..b1516fadbcd7f 100644 --- a/server/src/main/java/org/opensearch/wlm/stats/QueryGroupState.java 
+++ b/server/src/main/java/org/opensearch/wlm/stats/QueryGroupState.java @@ -21,12 +21,12 @@ public class QueryGroupState { /** * completions at the query group level, this is a cumulative counter since the Opensearch start time */ - final CounterMetric completions = new CounterMetric(); + public final CounterMetric completions = new CounterMetric(); /** * rejections at the query group level, this is a cumulative counter since the OpenSearch start time */ - final CounterMetric totalRejections = new CounterMetric(); + public final CounterMetric totalRejections = new CounterMetric(); /** * this will track the cumulative failures in a query group @@ -36,7 +36,7 @@ public class QueryGroupState { /** * This will track total number of cancellations in the query group due to all resource type breaches */ - final CounterMetric totalCancellations = new CounterMetric(); + public final CounterMetric totalCancellations = new CounterMetric(); /** * This is used to store the resource type state both for CPU and MEMORY @@ -92,9 +92,9 @@ public Map getResourceState() { * This class holds the resource level stats for the query group */ public static class ResourceTypeState { - final ResourceType resourceType; - final CounterMetric cancellations = new CounterMetric(); - final CounterMetric rejections = new CounterMetric(); + public final ResourceType resourceType; + public final CounterMetric cancellations = new CounterMetric(); + public final CounterMetric rejections = new CounterMetric(); private double lastRecordedUsage = 0; public ResourceTypeState(ResourceType resourceType) { diff --git a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java index b23d9ff342139..19f7bf48d8421 100644 --- a/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java +++ 
b/server/src/main/java/org/opensearch/wlm/tracker/QueryGroupResourceUsageTrackerService.java @@ -47,6 +47,8 @@ public Map constructQueryGroupLevelUsa // Iterate over each QueryGroup entry for (Map.Entry> queryGroupEntry : tasksByQueryGroup.entrySet()) { + // refresh the resource stats + taskResourceTrackingService.refreshResourceStats(queryGroupEntry.getValue().toArray(new QueryGroupTask[0])); // Compute the QueryGroup resource usage final Map queryGroupUsage = new EnumMap<>(ResourceType.class); for (ResourceType resourceType : TRACKED_RESOURCES) { diff --git a/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java b/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java new file mode 100644 index 0000000000000..2d5bb90bc0859 --- /dev/null +++ b/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java @@ -0,0 +1,331 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.wlm; + +import org.opensearch.cluster.ClusterChangedEvent; +import org.opensearch.cluster.ClusterState; +import org.opensearch.cluster.metadata.Metadata; +import org.opensearch.cluster.metadata.QueryGroup; +import org.opensearch.cluster.service.ClusterService; +import org.opensearch.common.unit.TimeValue; +import org.opensearch.core.concurrency.OpenSearchRejectedExecutionException; +import org.opensearch.search.backpressure.trackers.NodeDuressTrackers; +import org.opensearch.test.OpenSearchTestCase; +import org.opensearch.threadpool.Scheduler; +import org.opensearch.threadpool.ThreadPool; +import org.opensearch.wlm.cancellation.QueryGroupTaskCancellationService; +import org.opensearch.wlm.cancellation.TaskSelectionStrategy; +import org.opensearch.wlm.stats.QueryGroupState; +import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService; +import org.junit.Before; + +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.function.BooleanSupplier; + +import org.mockito.ArgumentCaptor; +import org.mockito.Mockito; + +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.doNothing; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +public class QueryGroupServiceTests extends OpenSearchTestCase { + private QueryGroupService queryGroupService; + private QueryGroupTaskCancellationService mockCancellationService; + private ClusterService mockClusterService; + private ThreadPool mockThreadPool; + private WorkloadManagementSettings mockWorkloadManagementSettings; + private Scheduler.Cancellable mockScheduledFuture; + private Map mockQueryGroupStateMap; + NodeDuressTrackers mockNodeDuressTrackers; + + @Before + 
public void setup() { + mockClusterService = Mockito.mock(ClusterService.class); + mockThreadPool = Mockito.mock(ThreadPool.class); + mockScheduledFuture = Mockito.mock(Scheduler.Cancellable.class); + mockWorkloadManagementSettings = Mockito.mock(WorkloadManagementSettings.class); + mockQueryGroupStateMap = new HashMap<>(); + mockNodeDuressTrackers = Mockito.mock(NodeDuressTrackers.class); + mockCancellationService = Mockito.mock(TestQueryGroupCancellationService.class); + + queryGroupService = new QueryGroupService( + mockCancellationService, + mockClusterService, + mockThreadPool, + mockWorkloadManagementSettings, + mockNodeDuressTrackers, + mockQueryGroupStateMap, + new HashSet<>(), + new HashSet<>() + ); + } + + public void testApplyClusterState() { + ClusterChangedEvent mockClusterChangedEvent = Mockito.mock(ClusterChangedEvent.class); + ClusterState mockPreviousClusterState = Mockito.mock(ClusterState.class); + ClusterState mockClusterState = Mockito.mock(ClusterState.class); + Metadata mockPreviousMetadata = Mockito.mock(Metadata.class); + Metadata mockMetadata = Mockito.mock(Metadata.class); + QueryGroup addedQueryGroup = new QueryGroup( + "addedQueryGroup", + "4242", + new MutableQueryGroupFragment(MutableQueryGroupFragment.ResiliencyMode.ENFORCED, Map.of(ResourceType.MEMORY, 0.5)), + 1L + ); + QueryGroup deletedQueryGroup = new QueryGroup( + "deletedQueryGroup", + "4241", + new MutableQueryGroupFragment(MutableQueryGroupFragment.ResiliencyMode.ENFORCED, Map.of(ResourceType.MEMORY, 0.5)), + 1L + ); + Map previousQueryGroups = new HashMap<>(); + previousQueryGroups.put("4242", addedQueryGroup); + Map currentQueryGroups = new HashMap<>(); + currentQueryGroups.put("4241", deletedQueryGroup); + + when(mockClusterChangedEvent.previousState()).thenReturn(mockPreviousClusterState); + when(mockClusterChangedEvent.state()).thenReturn(mockClusterState); + when(mockPreviousClusterState.metadata()).thenReturn(mockPreviousMetadata); + 
when(mockClusterState.metadata()).thenReturn(mockMetadata); + when(mockPreviousMetadata.queryGroups()).thenReturn(previousQueryGroups); + when(mockMetadata.queryGroups()).thenReturn(currentQueryGroups); + queryGroupService.applyClusterState(mockClusterChangedEvent); + + Set currentQueryGroupsExpected = Set.of(currentQueryGroups.get("4241")); + Set previousQueryGroupsExpected = Set.of(previousQueryGroups.get("4242")); + + assertEquals(currentQueryGroupsExpected, queryGroupService.getActiveQueryGroups()); + assertEquals(previousQueryGroupsExpected, queryGroupService.getDeletedQueryGroups()); + } + + public void testDoStart_SchedulesTask() { + when(mockWorkloadManagementSettings.getWlmMode()).thenReturn(WlmMode.ENABLED); + when(mockWorkloadManagementSettings.getQueryGroupServiceRunInterval()).thenReturn(TimeValue.timeValueSeconds(1)); + queryGroupService.doStart(); + Mockito.verify(mockThreadPool).scheduleWithFixedDelay(any(Runnable.class), any(TimeValue.class), eq(ThreadPool.Names.GENERIC)); + } + + public void testDoStop_CancelsScheduledTask() { + when(mockWorkloadManagementSettings.getWlmMode()).thenReturn(WlmMode.ENABLED); + when(mockThreadPool.scheduleWithFixedDelay(any(), any(), any())).thenReturn(mockScheduledFuture); + queryGroupService.doStart(); + queryGroupService.doStop(); + Mockito.verify(mockScheduledFuture).cancel(); + } + + public void testDoRun_WhenModeEnabled() { + when(mockWorkloadManagementSettings.getWlmMode()).thenReturn(WlmMode.ENABLED); + when(mockNodeDuressTrackers.isNodeInDuress()).thenReturn(true); + doNothing().when(mockCancellationService).refreshQueryGroups(any(), any()); + // Call the method + queryGroupService.doRun(); + + // Verify that refreshQueryGroups was called + Mockito.verify(mockCancellationService).refreshQueryGroups(any(), any()); + + // Verify that cancelTasks was called with a BooleanSupplier + ArgumentCaptor booleanSupplierCaptor = ArgumentCaptor.forClass(BooleanSupplier.class); + 
Mockito.verify(mockCancellationService).cancelTasks(booleanSupplierCaptor.capture()); + + // Assert the behavior of the BooleanSupplier + BooleanSupplier capturedSupplier = booleanSupplierCaptor.getValue(); + assertTrue(capturedSupplier.getAsBoolean()); + + } + + public void testDoRun_WhenModeDisabled() { + when(mockWorkloadManagementSettings.getWlmMode()).thenReturn(WlmMode.DISABLED); + when(mockNodeDuressTrackers.isNodeInDuress()).thenReturn(false); + queryGroupService.doRun(); + // Verify that refreshQueryGroups was called + Mockito.verify(mockCancellationService, never()).refreshQueryGroups(any(), any()); + + Mockito.verify(mockCancellationService, never()).cancelTasks(any()); + + } + + public void testRejectIfNeeded_whenQueryGroupIdIsNull() { + QueryGroup testQueryGroup = new QueryGroup( + "testQueryGroup", + "queryGroupId1", + new MutableQueryGroupFragment(MutableQueryGroupFragment.ResiliencyMode.ENFORCED, Map.of(ResourceType.CPU, 0.10)), + 1L + ); + Set activeQueryGroups = new HashSet<>() { + { + add(testQueryGroup); + } + }; + mockQueryGroupStateMap = new HashMap<>(); + mockQueryGroupStateMap.put("queryGroupId1", new QueryGroupState()); + + Map spyMap = spy(mockQueryGroupStateMap); + + queryGroupService = new QueryGroupService( + mockCancellationService, + mockClusterService, + mockThreadPool, + mockWorkloadManagementSettings, + mockNodeDuressTrackers, + spyMap, + activeQueryGroups, + new HashSet<>() + ); + queryGroupService.rejectIfNeeded(null); + + verify(spyMap, never()).get(any()); + } + + public void testRejectIfNeeded_whenQueryGroupIsSoftMode() { + QueryGroup testQueryGroup = new QueryGroup( + "testQueryGroup", + "queryGroupId1", + new MutableQueryGroupFragment(MutableQueryGroupFragment.ResiliencyMode.SOFT, Map.of(ResourceType.CPU, 0.10)), + 1L + ); + Set activeQueryGroups = new HashSet<>() { + { + add(testQueryGroup); + } + }; + mockQueryGroupStateMap = new HashMap<>(); + QueryGroupState spyState = spy(new QueryGroupState()); + 
mockQueryGroupStateMap.put("queryGroupId1", spyState); + + Map spyMap = spy(mockQueryGroupStateMap); + + queryGroupService = new QueryGroupService( + mockCancellationService, + mockClusterService, + mockThreadPool, + mockWorkloadManagementSettings, + mockNodeDuressTrackers, + spyMap, + activeQueryGroups, + new HashSet<>() + ); + queryGroupService.rejectIfNeeded("queryGroupId1"); + + verify(spyState, never()).getResourceState(); + } + + public void testRejectIfNeeded_whenQueryGroupIsEnforcedMode_andNotBreaching() { + QueryGroup testQueryGroup = new QueryGroup( + "testQueryGroup", + "queryGroupId1", + new MutableQueryGroupFragment(MutableQueryGroupFragment.ResiliencyMode.ENFORCED, Map.of(ResourceType.CPU, 0.10)), + 1L + ); + QueryGroup spuQueryGroup = spy(testQueryGroup); + Set activeQueryGroups = new HashSet<>() { + { + add(spuQueryGroup); + } + }; + mockQueryGroupStateMap = new HashMap<>(); + QueryGroupState queryGroupState = new QueryGroupState(); + queryGroupState.getResourceState().get(ResourceType.CPU).setLastRecordedUsage(0.08); + + mockQueryGroupStateMap.put("queryGroupId1", queryGroupState); + + queryGroupService = new QueryGroupService( + mockCancellationService, + mockClusterService, + mockThreadPool, + mockWorkloadManagementSettings, + mockNodeDuressTrackers, + mockQueryGroupStateMap, + activeQueryGroups, + new HashSet<>() + ); + queryGroupService.rejectIfNeeded("queryGroupId1"); + + // verify the check to compare the current usage and limit + // this should happen 3 times => 2 to check whether the resource limit has the TRACKED resource type and 1 to get the value + verify(spuQueryGroup, times(3)).getResourceLimits(); + assertEquals(0, queryGroupState.getResourceState().get(ResourceType.CPU).rejections.count()); + assertEquals(0, queryGroupState.totalRejections.count()); + } + + public void testRejectIfNeeded_whenQueryGroupIsEnforcedMode_andBreaching() { + QueryGroup testQueryGroup = new QueryGroup( + "testQueryGroup", + "queryGroupId1", + new 
MutableQueryGroupFragment( + MutableQueryGroupFragment.ResiliencyMode.ENFORCED, + Map.of(ResourceType.CPU, 0.10, ResourceType.MEMORY, 0.10) + ), + 1L + ); + QueryGroup spuQueryGroup = spy(testQueryGroup); + Set activeQueryGroups = new HashSet<>() { + { + add(spuQueryGroup); + } + }; + mockQueryGroupStateMap = new HashMap<>(); + QueryGroupState queryGroupState = new QueryGroupState(); + queryGroupState.getResourceState().get(ResourceType.CPU).setLastRecordedUsage(0.18); + queryGroupState.getResourceState().get(ResourceType.MEMORY).setLastRecordedUsage(0.18); + QueryGroupState spyState = spy(queryGroupState); + + mockQueryGroupStateMap.put("queryGroupId1", spyState); + + queryGroupService = new QueryGroupService( + mockCancellationService, + mockClusterService, + mockThreadPool, + mockWorkloadManagementSettings, + mockNodeDuressTrackers, + mockQueryGroupStateMap, + activeQueryGroups, + new HashSet<>() + ); + assertThrows(OpenSearchRejectedExecutionException.class, () -> queryGroupService.rejectIfNeeded("queryGroupId1")); + + // verify the check to compare the current usage and limit + // this should happen 3 times => 1 to check whether the resource limit has the TRACKED resource type and 1 to get the value + // because it will break out of the loop since the limits are breached + verify(spuQueryGroup, times(2)).getResourceLimits(); + assertEquals( + 1, + queryGroupState.getResourceState().get(ResourceType.CPU).rejections.count() + queryGroupState.getResourceState() + .get(ResourceType.MEMORY).rejections.count() + ); + assertEquals(1, queryGroupState.totalRejections.count()); + } + + // This is needed to test the behavior of QueryGroupService#doRun method + static class TestQueryGroupCancellationService extends QueryGroupTaskCancellationService { + public TestQueryGroupCancellationService( + WorkloadManagementSettings workloadManagementSettings, + TaskSelectionStrategy taskSelectionStrategy, + QueryGroupResourceUsageTrackerService resourceUsageTrackerService, + 
Collection activeQueryGroups, + Collection deletedQueryGroups + ) { + super(workloadManagementSettings, taskSelectionStrategy, resourceUsageTrackerService, activeQueryGroups, deletedQueryGroups); + } + + @Override + public void cancelTasks(BooleanSupplier isNodeInDuress) { + + } + } +} diff --git a/server/src/test/java/org/opensearch/wlm/WorkloadManagementTransportInterceptorTests.java b/server/src/test/java/org/opensearch/wlm/WorkloadManagementTransportInterceptorTests.java index 4159ab9a17962..4d539fa708c42 100644 --- a/server/src/test/java/org/opensearch/wlm/WorkloadManagementTransportInterceptorTests.java +++ b/server/src/test/java/org/opensearch/wlm/WorkloadManagementTransportInterceptorTests.java @@ -21,9 +21,9 @@ import java.util.Collections; +import static org.opensearch.threadpool.ThreadPool.Names.SAME; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; -import static org.opensearch.threadpool.ThreadPool.Names.SAME; public class WorkloadManagementTransportInterceptorTests extends OpenSearchTestCase { private QueryGroupTaskCancellationService mockTaskCancellationService; @@ -45,13 +45,9 @@ public void setUp() throws Exception { when(mockClusterService.state()).thenReturn(state); when(state.metadata()).thenReturn(metadata); when(metadata.queryGroups()).thenReturn(Collections.emptyMap()); - sut = new WorkloadManagementTransportInterceptor(threadPool, - new QueryGroupService( - mockTaskCancellationService, - mockClusterService, - mockThreadPool, - mockWorkloadManagementSettings - ) + sut = new WorkloadManagementTransportInterceptor( + threadPool, + new QueryGroupService(mockTaskCancellationService, mockClusterService, mockThreadPool, mockWorkloadManagementSettings) ); } diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java index f7a49235efc69..05d5de7dfb762 100644 --- 
a/server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java @@ -19,6 +19,7 @@ import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; import org.opensearch.wlm.WorkloadManagementSettings; +import org.opensearch.wlm.stats.QueryGroupState; import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService; import org.opensearch.wlm.tracker.ResourceUsageCalculatorTrackerServiceTests.TestClock; import org.junit.Before; @@ -66,6 +67,7 @@ public void setup() { activeQueryGroups, deletedQueryGroups ); + taskCancellation.setQueryGroupStateMapAccessor((x) -> new QueryGroupState()); } public void testGetCancellableTasksFrom_setupAppropriateCancellationReasonAndScore() { @@ -222,6 +224,7 @@ public void testCancelTasks_cancelsGivenTasks() { activeQueryGroups, deletedQueryGroups ); + taskCancellation.setQueryGroupStateMapAccessor((x) -> new QueryGroupState()); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; @@ -284,7 +287,7 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { activeQueryGroups, deletedQueryGroups ); - + taskCancellation.setQueryGroupStateMapAccessor((x) -> new QueryGroupState()); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); @@ -355,6 +358,7 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN activeQueryGroups, deletedQueryGroups ); + taskCancellation.setQueryGroupStateMapAccessor((x) -> new QueryGroupState()); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); @@ -414,6 +418,7 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { activeQueryGroups, 
deletedQueryGroups ); + taskCancellation.setQueryGroupStateMapAccessor((x) -> new QueryGroupState()); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; diff --git a/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java b/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java index 74e6a707508ec..81c64e5368186 100644 --- a/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java +++ b/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java @@ -95,6 +95,19 @@ public void testValidQueryGroupRequestFailure() throws IOException { ResourceType.MEMORY, new QueryGroupStats.ResourceStats(0, 0, 0) ) + ), + QueryGroupTask.DEFAULT_QUERY_GROUP_ID_SUPPLIER.get(), + new QueryGroupStats.QueryGroupStatsHolder( + 0, + 0, + 0, + 0, + Map.of( + ResourceType.CPU, + new QueryGroupStats.ResourceStats(0, 0, 0), + ResourceType.MEMORY, + new QueryGroupStats.ResourceStats(0, 0, 0) + ) ) ) ); @@ -113,8 +126,8 @@ public void testMultiThreadedValidQueryGroupRequestFailures() { mockWorkloadManagementSettings, null, queryGroupStateMap, - null, - null + Collections.emptySet(), + Collections.emptySet() ); sut = new QueryGroupRequestOperationListener(queryGroupService, testThreadPool); @@ -154,6 +167,19 @@ public void testMultiThreadedValidQueryGroupRequestFailures() { ResourceType.MEMORY, new QueryGroupStats.ResourceStats(0, 0, 0) ) + ), + QueryGroupTask.DEFAULT_QUERY_GROUP_ID_SUPPLIER.get(), + new QueryGroupStats.QueryGroupStatsHolder( + 0, + 0, + 0, + 0, + Map.of( + ResourceType.CPU, + new QueryGroupStats.ResourceStats(0, 0, 0), + ResourceType.MEMORY, + new QueryGroupStats.ResourceStats(0, 0, 0) + ) ) ) ); @@ -165,6 +191,19 @@ public void testInvalidQueryGroupFailure() throws IOException { QueryGroupStats expectedStats = new QueryGroupStats( Map.of( testQueryGroupId, + new QueryGroupStats.QueryGroupStatsHolder( + 0, 
+ 0, + 0, + 0, + Map.of( + ResourceType.CPU, + new QueryGroupStats.ResourceStats(0, 0, 0), + ResourceType.MEMORY, + new QueryGroupStats.ResourceStats(0, 0, 0) + ) + ), + QueryGroupTask.DEFAULT_QUERY_GROUP_ID_SUPPLIER.get(), new QueryGroupStats.QueryGroupStatsHolder( 0, 0, @@ -204,8 +243,8 @@ private void assertSuccess( mockWorkloadManagementSettings, null, queryGroupStateMap, - null, - null + Collections.emptySet(), + Collections.emptySet() ); sut = new QueryGroupRequestOperationListener(queryGroupService, testThreadPool); From 931f6bf307696960d1a7f1bc770dada88b356565 Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Thu, 12 Sep 2024 20:54:14 -0700 Subject: [PATCH 36/47] add changelog Signed-off-by: Kaushal Kumar --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3268852cc99f9..f20de58b841f7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Added - MultiTermQueries in keyword fields now default to `indexed` approach and gated behind cluster setting ([#15637](https://github.com/opensearch-project/OpenSearch/pull/15637)) - [Workload Management] QueryGroup resource cancellation framework changes ([#15651](https://github.com/opensearch-project/OpenSearch/pull/15651)) +- [Workload Management] Add orchestrator for wlm resiliency (QueryGroupService) ([#15925](https://github.com/opensearch-project/OpenSearch/pull/15925)) - Fallback to Remote cluster-state on Term-Version check mismatch - ([#15424](https://github.com/opensearch-project/OpenSearch/pull/15424)) ### Dependencies From 3f4d59099027ad399270d0a859cf0f097f488814 Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Fri, 13 Sep 2024 10:46:24 -0700 Subject: [PATCH 37/47] add task completion listener hook Signed-off-by: Kaushal Kumar --- .../main/java/org/opensearch/node/Node.java | 1 + .../org/opensearch/wlm/QueryGroupService.java | 17 ++++++- 
.../QueryGroupTaskCancellationService.java | 2 +- .../wlm/QueryGroupServiceTests.java | 45 +++++++++++++++++-- 4 files changed, 60 insertions(+), 5 deletions(-) diff --git a/server/src/main/java/org/opensearch/node/Node.java b/server/src/main/java/org/opensearch/node/Node.java index aff9a6a3429a4..4755ff49a0ead 100644 --- a/server/src/main/java/org/opensearch/node/Node.java +++ b/server/src/main/java/org/opensearch/node/Node.java @@ -1048,6 +1048,7 @@ protected Node( threadPool, workloadManagementSettings ); + taskResourceTrackingService.addTaskCompletionListener(queryGroupService); final QueryGroupRequestOperationListener queryGroupRequestOperationListener = new QueryGroupRequestOperationListener( queryGroupService, diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java index 2b99c3eda0242..1fd415b44ee88 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java @@ -21,6 +21,8 @@ import org.opensearch.monitor.process.ProcessProbe; import org.opensearch.search.backpressure.trackers.NodeDuressTrackers; import org.opensearch.search.backpressure.trackers.NodeDuressTrackers.NodeDuressTracker; +import org.opensearch.tasks.Task; +import org.opensearch.tasks.TaskResourceTrackingService; import org.opensearch.threadpool.Scheduler; import org.opensearch.threadpool.ThreadPool; import org.opensearch.wlm.cancellation.QueryGroupTaskCancellationService; @@ -39,7 +41,10 @@ /** * As of now this is a stub and main implementation PR will be raised soon.Coming PR will collate these changes with core QueryGroupService changes */ -public class QueryGroupService extends AbstractLifecycleComponent implements ClusterStateApplier { +public class QueryGroupService extends AbstractLifecycleComponent + implements + ClusterStateApplier, + TaskResourceTrackingService.TaskCompletionListener { // This map does not need to be 
concurrent since we will process the cluster state change serially and update // this map with new additions and deletions of entries. QueryGroupState is thread safe private final Map queryGroupStateMap; @@ -266,4 +271,14 @@ public Set getActiveQueryGroups() { public Set getDeletedQueryGroups() { return deletedQueryGroups; } + + @Override + public void onTaskCompleted(Task task) { + if (!(task instanceof QueryGroupTask)) { + return; + } + final QueryGroupTask queryGroupTask = (QueryGroupTask) task; + final String queryGroupId = queryGroupTask.getQueryGroupId(); + queryGroupStateMap.get(queryGroupId).completions.inc(); + } } diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java b/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java index d12716a5a6543..2ad0070b75701 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java @@ -91,7 +91,7 @@ public void setQueryGroupStateMapAccessor(final Function mockQueryGroupStateMap; NodeDuressTrackers mockNodeDuressTrackers; - @Before - public void setup() { + public void setUp() throws Exception { + super.setUp(); mockClusterService = Mockito.mock(ClusterService.class); mockThreadPool = Mockito.mock(ThreadPool.class); mockScheduledFuture = Mockito.mock(Scheduler.Cancellable.class); @@ -76,6 +78,11 @@ public void setup() { ); } + public void tearDown() throws Exception { + super.tearDown(); + mockThreadPool.shutdown(); + } + public void testApplyClusterState() { ClusterChangedEvent mockClusterChangedEvent = Mockito.mock(ClusterChangedEvent.class); ClusterState mockPreviousClusterState = Mockito.mock(ClusterState.class); @@ -311,6 +318,38 @@ public void testRejectIfNeeded_whenQueryGroupIsEnforcedMode_andBreaching() { assertEquals(1, queryGroupState.totalRejections.count()); } + public void 
testOnTaskCompleted() { + Task task = createMockTaskWithResourceStats(QueryGroupTask.class, 100, 200, 0, 12); + mockThreadPool = new TestThreadPool("queryGroupServiceTests"); + mockThreadPool.getThreadContext().putHeader(QueryGroupTask.QUERY_GROUP_ID_HEADER, "testId"); + QueryGroupState queryGroupState = new QueryGroupState(); + mockQueryGroupStateMap.put("testId", queryGroupState); + queryGroupService = new QueryGroupService( + mockCancellationService, + mockClusterService, + mockThreadPool, + mockWorkloadManagementSettings, + mockNodeDuressTrackers, + mockQueryGroupStateMap, + new HashSet<>(), + new HashSet<>() + ); + + ((QueryGroupTask) task).setQueryGroupId(mockThreadPool.getThreadContext()); + queryGroupService.onTaskCompleted(task); + + assertEquals(1, queryGroupState.completions.count()); + + // test non QueryGroupTask + task = new Task(1, "simple", "test", "mock task", null, null); + queryGroupService.onTaskCompleted(task); + + // It should still be 1 + assertEquals(1, queryGroupState.completions.count()); + + mockThreadPool.shutdown(); + } + // This is needed to test the behavior of QueryGroupService#doRun method static class TestQueryGroupCancellationService extends QueryGroupTaskCancellationService { public TestQueryGroupCancellationService( From 501e5e94aec4f4745090ec25033f38a7f5bd0f57 Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Fri, 13 Sep 2024 13:58:31 -0700 Subject: [PATCH 38/47] add remaining pieces to make the feature functional Signed-off-by: Kaushal Kumar --- .../org/opensearch/action/ActionModule.java | 6 +- .../main/java/org/opensearch/node/Node.java | 2 + .../org/opensearch/wlm/QueryGroupService.java | 61 +++++++++++-------- .../QueryGroupTaskCancellationService.java | 17 ++++++ .../wlm/QueryGroupServiceTests.java | 5 +- ...ueryGroupTaskCancellationServiceTests.java | 46 ++++++++++++++ 6 files changed, 108 insertions(+), 29 deletions(-) diff --git a/server/src/main/java/org/opensearch/action/ActionModule.java 
b/server/src/main/java/org/opensearch/action/ActionModule.java index fbf90b97d1e8f..b0c076f3fb36a 100644 --- a/server/src/main/java/org/opensearch/action/ActionModule.java +++ b/server/src/main/java/org/opensearch/action/ActionModule.java @@ -476,6 +476,7 @@ import org.opensearch.tasks.Task; import org.opensearch.threadpool.ThreadPool; import org.opensearch.usage.UsageService; +import org.opensearch.wlm.QueryGroupTask; import java.util.ArrayList; import java.util.Collections; @@ -559,7 +560,10 @@ public ActionModule( destructiveOperations = new DestructiveOperations(settings, clusterSettings); Set headers = Stream.concat( actionPlugins.stream().flatMap(p -> p.getRestHeaders().stream()), - Stream.of(new RestHeaderDefinition(Task.X_OPAQUE_ID, false)) + Stream.of( + new RestHeaderDefinition(Task.X_OPAQUE_ID, false), + new RestHeaderDefinition(QueryGroupTask.QUERY_GROUP_ID_HEADER, false) + ) ).collect(Collectors.toSet()); UnaryOperator restWrapper = null; for (ActionPlugin plugin : actionPlugins) { diff --git a/server/src/main/java/org/opensearch/node/Node.java b/server/src/main/java/org/opensearch/node/Node.java index 4755ff49a0ead..b12c985787e4f 100644 --- a/server/src/main/java/org/opensearch/node/Node.java +++ b/server/src/main/java/org/opensearch/node/Node.java @@ -1422,6 +1422,7 @@ protected Node( b.bind(IndexingPressureService.class).toInstance(indexingPressureService); b.bind(TaskResourceTrackingService.class).toInstance(taskResourceTrackingService); b.bind(SearchBackpressureService.class).toInstance(searchBackpressureService); + b.bind(QueryGroupService.class).toInstance(queryGroupService); b.bind(AdmissionControlService.class).toInstance(admissionControlService); b.bind(UsageService.class).toInstance(usageService); b.bind(AggregationUsageService.class).toInstance(searchModule.getValuesSourceRegistry().getUsageService()); @@ -1615,6 +1616,7 @@ public Node start() throws NodeValidationException { nodeService.getMonitorService().start(); 
nodeService.getSearchBackpressureService().start(); nodeService.getTaskCancellationMonitoringService().start(); + injector.getInstance(QueryGroupService.class).start(); final ClusterService clusterService = injector.getInstance(ClusterService.class); diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java index 1fd415b44ee88..ee0eb40adebd2 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java @@ -34,6 +34,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Map; +import java.util.Optional; import java.util.Set; import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.TRACKED_RESOURCES; @@ -89,7 +90,7 @@ public QueryGroupService( ) ), new HashMap<>(), - new HashSet<>(clusterService.state().metadata().queryGroups().values()), + new HashSet<>(), new HashSet<>() ); } @@ -115,6 +116,7 @@ public QueryGroupService( this.queryGroupStateMap = stateMap; this.queryGroupStateMap.put(QueryGroupTask.DEFAULT_QUERY_GROUP_ID_SUPPLIER.get(), new QueryGroupState()); taskCancellationService.setQueryGroupStateMapAccessor(this::getQueryGroupState); + clusterService.addStateApplier(this); } /** @@ -126,6 +128,7 @@ protected void doRun() { } taskCancellationService.refreshQueryGroups(activeQueryGroups, deletedQueryGroups); taskCancellationService.cancelTasks(nodeDuressTrackers::isNodeInDuress); + taskCancellationService.pruneDeletedQueryGroups(); } private QueryGroupState getQueryGroupState(final String queryGroupId) { @@ -223,7 +226,7 @@ public QueryGroupStats nodeStats() { * @param queryGroupId query group identifier */ public void rejectIfNeeded(String queryGroupId) { - if (queryGroupId == null) return; + if (queryGroupId == null || queryGroupId.equals(QueryGroupTask.DEFAULT_QUERY_GROUP_ID_SUPPLIER.get())) return; QueryGroupState queryGroupState = 
queryGroupStateMap.get(queryGroupId); // This can happen if the request failed for a deleted query group @@ -232,36 +235,40 @@ public void rejectIfNeeded(String queryGroupId) { return; } - boolean reject = false; - final StringBuilder reason = new StringBuilder(); - // rejections will not happen for SOFT mode QueryGroups - QueryGroup queryGroup = activeQueryGroups.stream().filter(x -> x.get_id().equals(queryGroupId)).findFirst().get(); + Optional optionalQueryGroup = activeQueryGroups.stream().filter(x -> x.get_id().equals(queryGroupId)).findFirst(); - if (queryGroup.getResiliencyMode() == MutableQueryGroupFragment.ResiliencyMode.SOFT) return; + if (optionalQueryGroup.isPresent() && optionalQueryGroup.get().getResiliencyMode() == MutableQueryGroupFragment.ResiliencyMode.SOFT) + return; - for (ResourceType resourceType : TRACKED_RESOURCES) { - if (queryGroup.getResourceLimits().containsKey(resourceType)) { - final double threshold = queryGroup.getResourceLimits().get(resourceType); - final double lastRecordedUsage = queryGroupState.getResourceState().get(resourceType).getLastRecordedUsage(); - if (threshold < lastRecordedUsage) { - reject = true; - reason.append(resourceType) - .append(" limit is breaching for ENFORCED type QueryGroup: (") - .append(threshold) - .append(" < ") - .append(lastRecordedUsage) - .append("). 
"); - queryGroupState.getResourceState().get(resourceType).rejections.inc(); - // should not double count even if both the resource limits are breaching - break; + optionalQueryGroup.ifPresent(queryGroup -> { + boolean reject = false; + final StringBuilder reason = new StringBuilder(); + for (ResourceType resourceType : TRACKED_RESOURCES) { + if (queryGroup.getResourceLimits().containsKey(resourceType)) { + final double threshold = queryGroup.getResourceLimits().get(resourceType); + final double lastRecordedUsage = queryGroupState.getResourceState().get(resourceType).getLastRecordedUsage(); + if (threshold < lastRecordedUsage) { + reject = true; + reason.append(resourceType) + .append(" limit is breaching for ENFORCED type QueryGroup: (") + .append(threshold) + .append(" < ") + .append(lastRecordedUsage) + .append("). "); + queryGroupState.getResourceState().get(resourceType).rejections.inc(); + // should not double count even if both the resource limits are breaching + break; + } } } - } - if (reject) { - queryGroupState.totalRejections.inc(); - throw new OpenSearchRejectedExecutionException("QueryGroup " + queryGroupId + " is already contended. " + reason.toString()); - } + if (reject) { + queryGroupState.totalRejections.inc(); + throw new OpenSearchRejectedExecutionException( + "QueryGroup " + queryGroupId + " is already contended. 
" + reason.toString() + ); + } + }); } public Set getActiveQueryGroups() { diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java b/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java index 2ad0070b75701..69d9d52c16928 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java @@ -239,8 +239,25 @@ private QueryGroupState getQueryGroupState(String queryGroupId) { return queryGroupStateAccessor.apply(queryGroupId); } + /** + * sets the current active and deleted query groups + * @param activeQueryGroups + * @param deletedQueryGroups + */ public void refreshQueryGroups(Collection activeQueryGroups, Collection deletedQueryGroups) { this.activeQueryGroups = activeQueryGroups; this.deletedQueryGroups = deletedQueryGroups; } + + /** + * Removes the queryGroups from deleted list if it doesn't have any tasks running + */ + public void pruneDeletedQueryGroups() { + List currentDeletedQueryGroups = new ArrayList<>(deletedQueryGroups); + for (QueryGroup queryGroup : currentDeletedQueryGroups) { + if (queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getActiveTasks().isEmpty()) { + deletedQueryGroups.remove(queryGroup); + } + } + } } diff --git a/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java b/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java index b10c8697d6fc6..62c4e21406de4 100644 --- a/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java +++ b/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java @@ -167,7 +167,7 @@ public void testDoRun_WhenModeDisabled() { } - public void testRejectIfNeeded_whenQueryGroupIdIsNull() { + public void testRejectIfNeeded_whenQueryGroupIdIsNullOrDefaultOne() { QueryGroup testQueryGroup = new QueryGroup( "testQueryGroup", "queryGroupId1", 
@@ -197,6 +197,9 @@ public void testRejectIfNeeded_whenQueryGroupIdIsNull() { queryGroupService.rejectIfNeeded(null); verify(spyMap, never()).get(any()); + + queryGroupService.rejectIfNeeded(QueryGroupTask.DEFAULT_QUERY_GROUP_ID_SUPPLIER.get()); + verify(spyMap, never()).get(any()); } public void testRejectIfNeeded_whenQueryGroupIsSoftMode() { diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java index 05d5de7dfb762..0cb7aebfb6b81 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java @@ -24,6 +24,7 @@ import org.opensearch.wlm.tracker.ResourceUsageCalculatorTrackerServiceTests.TestClock; import org.junit.Before; +import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; @@ -32,6 +33,7 @@ import java.util.Map; import java.util.Set; import java.util.stream.Collectors; +import java.util.stream.IntStream; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -518,6 +520,50 @@ public void testGetCancellableTasksFrom_doesNotReturnTasksWhenQueryGroupIdNotFou assertEquals(0, cancellableTasksFrom.size()); } + public void testPruneDeletedQueryGroups() { + QueryGroup queryGroup1 = new QueryGroup( + "testQueryGroup1", + queryGroupId1, + new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(ResourceType.CPU, 0.2)), + 1L + ); + QueryGroup queryGroup2 = new QueryGroup( + "testQueryGroup2", + queryGroupId2, + new MutableQueryGroupFragment(ResiliencyMode.ENFORCED, Map.of(ResourceType.CPU, 0.1)), + 1L + ); + List deletedQueryGroups = new ArrayList<>(); + deletedQueryGroups.add(queryGroup1); + deletedQueryGroups.add(queryGroup2); + QueryGroupLevelResourceUsageView resourceUsageView1 = 
createResourceUsageViewMock(); + + List activeTasks = IntStream.range(0, 5).mapToObj(this::getRandomSearchTask).collect(Collectors.toList()); + when(resourceUsageView1.getActiveTasks()).thenReturn(activeTasks); + + QueryGroupLevelResourceUsageView resourceUsageView2 = createResourceUsageViewMock(); + when(resourceUsageView2.getActiveTasks()).thenReturn(new ArrayList<>()); + + queryGroupLevelViews.put(queryGroupId1, resourceUsageView1); + queryGroupLevelViews.put(queryGroupId2, resourceUsageView2); + + QueryGroupTaskCancellationService taskCancellation = new QueryGroupTaskCancellationService( + workloadManagementSettings, + new MaximumResourceTaskSelectionStrategy(), + resourceUsageTrackerService, + activeQueryGroups, + deletedQueryGroups + ); + taskCancellation.setQueryGroupStateMapAccessor((x) -> new QueryGroupState()); + taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; + + taskCancellation.pruneDeletedQueryGroups(); + + assertEquals(1, deletedQueryGroups.size()); + assertEquals(queryGroupId1, deletedQueryGroups.get(0).get_id()); + + } + private QueryGroupLevelResourceUsageView createResourceUsageViewMock() { QueryGroupLevelResourceUsageView mockView = mock(QueryGroupLevelResourceUsageView.class); when(mockView.getActiveTasks()).thenReturn(List.of(getRandomSearchTask(1234), getRandomSearchTask(4321))); From 2292fcd762a51a2dfecee5b5740d886797a45355 Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Mon, 16 Sep 2024 12:50:31 -0700 Subject: [PATCH 39/47] extend stats and fix bugs Signed-off-by: Kaushal Kumar --- .../org/opensearch/wlm/QueryGroupService.java | 20 ++++++++++++++++--- .../QueryGroupTaskCancellationService.java | 18 ++++++++++++++++- .../opensearch/wlm/stats/QueryGroupState.java | 17 ++++++++++++++-- .../opensearch/wlm/stats/QueryGroupStats.java | 16 ++++++++++++++- .../wlm/QueryGroupServiceTests.java | 19 ++++++++++++++++-- ...ueryGroupTaskCancellationServiceTests.java | 5 +++++ ...eryGroupRequestOperationListenerTests.java | 6 
++++++ .../wlm/stats/QueryGroupStateTests.java | 10 ++++++++-- .../wlm/stats/QueryGroupStatsTests.java | 6 ++++-- 9 files changed, 104 insertions(+), 13 deletions(-) diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java index ee0eb40adebd2..31743e3db3673 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java @@ -10,6 +10,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.opensearch.action.search.SearchShardTask; import org.opensearch.cluster.ClusterChangedEvent; import org.opensearch.cluster.ClusterStateApplier; import org.opensearch.cluster.metadata.Metadata; @@ -230,7 +231,7 @@ public void rejectIfNeeded(String queryGroupId) { QueryGroupState queryGroupState = queryGroupStateMap.get(queryGroupId); // This can happen if the request failed for a deleted query group - // or new queryGroup is being created and has not been acknowledged yet + // or new queryGroup is being created and has not been acknowledged yet or invalid query group id if (queryGroupState == null) { return; } @@ -285,7 +286,20 @@ public void onTaskCompleted(Task task) { return; } final QueryGroupTask queryGroupTask = (QueryGroupTask) task; - final String queryGroupId = queryGroupTask.getQueryGroupId(); - queryGroupStateMap.get(queryGroupId).completions.inc(); + String queryGroupId = queryGroupTask.getQueryGroupId(); + + // set the default queryGroupId if not existing in the active query groups + String finalQueryGroupId = queryGroupId; + boolean exists = activeQueryGroups.stream().anyMatch(queryGroup -> queryGroup.get_id().equals(finalQueryGroupId)); + + if (!exists) { + queryGroupId = QueryGroupTask.DEFAULT_QUERY_GROUP_ID_SUPPLIER.get(); + } + + if (task instanceof SearchShardTask) { + queryGroupStateMap.get(queryGroupId).shardCompletions.inc(); + } else { + 
queryGroupStateMap.get(queryGroupId).completions.inc(); + } } } diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java b/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java index 69d9d52c16928..bec2b4845d075 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java @@ -16,6 +16,7 @@ import org.opensearch.wlm.QueryGroupLevelResourceUsageView; import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; +import org.opensearch.wlm.WlmMode; import org.opensearch.wlm.WorkloadManagementSettings; import org.opensearch.wlm.stats.QueryGroupState; import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService; @@ -202,7 +203,22 @@ private void cancelTasks(ResiliencyMode resiliencyMode) { } private void cancelTasks(List cancellableTasks) { - cancellableTasks.forEach(TaskCancellation::cancel); + + Consumer cancellationLoggingConsumer = (taskCancellation -> { + log.warn( + "Task {} is eligible for cancellation for reason {}", + taskCancellation.getTask().getId(), + taskCancellation.getReasonString() + ); + }); + Consumer cancellationConsumer = cancellationLoggingConsumer; + if (workloadManagementSettings.getWlmMode() == WlmMode.ENABLED) { + cancellationConsumer = (taskCancellation -> { + cancellationLoggingConsumer.accept(taskCancellation); + taskCancellation.cancel(); + }); + } + cancellableTasks.forEach(cancellationConsumer); } private double getExcessUsage(QueryGroup queryGroup, ResourceType resourceType) { diff --git a/server/src/main/java/org/opensearch/wlm/stats/QueryGroupState.java b/server/src/main/java/org/opensearch/wlm/stats/QueryGroupState.java index b1516fadbcd7f..cbc7046a79464 100644 --- a/server/src/main/java/org/opensearch/wlm/stats/QueryGroupState.java +++ 
b/server/src/main/java/org/opensearch/wlm/stats/QueryGroupState.java @@ -19,10 +19,15 @@ */ public class QueryGroupState { /** - * completions at the query group level, this is a cumulative counter since the Opensearch start time + * co-ordinator level completions at the query group level, this is a cumulative counter since the Opensearch start time */ public final CounterMetric completions = new CounterMetric(); + /** + * shard level completions at the query group level, this is a cumulative counter since the Opensearch start time + */ + public final CounterMetric shardCompletions = new CounterMetric(); + /** * rejections at the query group level, this is a cumulative counter since the OpenSearch start time */ @@ -54,12 +59,20 @@ public QueryGroupState() { /** * - * @return completions in the query group + * @return co-ordinator completions in the query group */ public long getCompletions() { return completions.count(); } + /** + * + * @return shard completions in the query group + */ + public long getShardCompletions() { + return shardCompletions.count(); + } + /** * * @return rejections in the query group diff --git a/server/src/main/java/org/opensearch/wlm/stats/QueryGroupStats.java b/server/src/main/java/org/opensearch/wlm/stats/QueryGroupStats.java index 2b389c2167778..8324f7d82a375 100644 --- a/server/src/main/java/org/opensearch/wlm/stats/QueryGroupStats.java +++ b/server/src/main/java/org/opensearch/wlm/stats/QueryGroupStats.java @@ -8,6 +8,7 @@ package org.opensearch.wlm.stats; +import org.opensearch.Version; import org.opensearch.core.common.io.stream.StreamInput; import org.opensearch.core.common.io.stream.StreamOutput; import org.opensearch.core.common.io.stream.Writeable; @@ -91,7 +92,9 @@ public static class QueryGroupStatsHolder implements ToXContentObject, Writeable public static final String REJECTIONS = "rejections"; public static final String TOTAL_CANCELLATIONS = "total_cancellations"; public static final String FAILURES = "failures"; + public 
static final String SHARD_COMPLETIONS = "shard_completions"; private long completions; + private long shardCompletions; private long rejections; private long failures; private long totalCancellations; @@ -105,11 +108,13 @@ public QueryGroupStatsHolder( long rejections, long failures, long totalCancellations, + long shardCompletions, Map resourceStats ) { this.completions = completions; this.rejections = rejections; this.failures = failures; + this.shardCompletions = shardCompletions; this.totalCancellations = totalCancellations; this.resourceStats = resourceStats; } @@ -119,6 +124,9 @@ public QueryGroupStatsHolder(StreamInput in) throws IOException { this.rejections = in.readVLong(); this.failures = in.readVLong(); this.totalCancellations = in.readVLong(); + if (in.getVersion().onOrAfter(Version.V_2_18_0)) { + this.shardCompletions = in.readVLong(); + } this.resourceStats = in.readMap((i) -> ResourceType.fromName(i.readString()), ResourceStats::new); } @@ -140,6 +148,7 @@ public static QueryGroupStatsHolder from(QueryGroupState queryGroupState) { statsHolder.rejections = queryGroupState.getTotalRejections(); statsHolder.failures = queryGroupState.getFailures(); statsHolder.totalCancellations = queryGroupState.getTotalCancellations(); + statsHolder.shardCompletions = queryGroupState.getShardCompletions(); statsHolder.resourceStats = resourceStatsMap; return statsHolder; } @@ -155,6 +164,9 @@ public static void writeTo(StreamOutput out, QueryGroupStatsHolder statsHolder) out.writeVLong(statsHolder.rejections); out.writeVLong(statsHolder.failures); out.writeVLong(statsHolder.totalCancellations); + if (out.getVersion().onOrAfter(Version.V_2_18_0)) { + out.writeVLong(statsHolder.shardCompletions); + } out.writeMap(statsHolder.resourceStats, (o, val) -> o.writeString(val.getName()), ResourceStats::writeTo); } @@ -166,6 +178,7 @@ public void writeTo(StreamOutput out) throws IOException { @Override public XContentBuilder toXContent(XContentBuilder builder, Params params) 
throws IOException { builder.field(COMPLETIONS, completions); + builder.field(SHARD_COMPLETIONS, shardCompletions); builder.field(REJECTIONS, rejections); builder.field(FAILURES, failures); builder.field(TOTAL_CANCELLATIONS, totalCancellations); @@ -187,6 +200,7 @@ public boolean equals(Object o) { QueryGroupStatsHolder that = (QueryGroupStatsHolder) o; return completions == that.completions && rejections == that.rejections + && shardCompletions == that.shardCompletions && Objects.equals(resourceStats, that.resourceStats) && failures == that.failures && totalCancellations == that.totalCancellations; @@ -194,7 +208,7 @@ public boolean equals(Object o) { @Override public int hashCode() { - return Objects.hash(completions, rejections, totalCancellations, failures, resourceStats); + return Objects.hash(completions, shardCompletions, rejections, totalCancellations, failures, resourceStats); } } diff --git a/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java b/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java index 62c4e21406de4..7105df1b7493c 100644 --- a/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java +++ b/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java @@ -8,6 +8,7 @@ package org.opensearch.wlm; +import org.opensearch.action.search.SearchTask; import org.opensearch.cluster.ClusterChangedEvent; import org.opensearch.cluster.ClusterState; import org.opensearch.cluster.metadata.Metadata; @@ -322,7 +323,7 @@ public void testRejectIfNeeded_whenQueryGroupIsEnforcedMode_andBreaching() { } public void testOnTaskCompleted() { - Task task = createMockTaskWithResourceStats(QueryGroupTask.class, 100, 200, 0, 12); + Task task = createMockTaskWithResourceStats(SearchTask.class, 100, 200, 0, 12); mockThreadPool = new TestThreadPool("queryGroupServiceTests"); mockThreadPool.getThreadContext().putHeader(QueryGroupTask.QUERY_GROUP_ID_HEADER, "testId"); QueryGroupState queryGroupState = new QueryGroupState(); @@ 
-334,7 +335,21 @@ public void testOnTaskCompleted() { mockWorkloadManagementSettings, mockNodeDuressTrackers, mockQueryGroupStateMap, - new HashSet<>(), + new HashSet<>() { + { + add( + new QueryGroup( + "testQueryGroup", + "testId", + new MutableQueryGroupFragment( + MutableQueryGroupFragment.ResiliencyMode.ENFORCED, + Map.of(ResourceType.CPU, 0.10, ResourceType.MEMORY, 0.10) + ), + 1L + ) + ); + } + }, new HashSet<>() ); diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java index 0cb7aebfb6b81..4395e0cafac68 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java @@ -18,6 +18,7 @@ import org.opensearch.wlm.QueryGroupLevelResourceUsageView; import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; +import org.opensearch.wlm.WlmMode; import org.opensearch.wlm.WorkloadManagementSettings; import org.opensearch.wlm.stats.QueryGroupState; import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService; @@ -236,6 +237,7 @@ public void testCancelTasks_cancelsGivenTasks() { assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); when(resourceUsageTrackerService.constructQueryGroupLevelUsageViews()).thenReturn(queryGroupLevelViews); + when(workloadManagementSettings.getWlmMode()).thenReturn(WlmMode.ENABLED); taskCancellation.cancelTasks(() -> false); assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); assertTrue(cancellableTasksFrom.get(1).getTask().isCancelled()); @@ -303,6 +305,7 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { assertEquals(1001, cancellableTasksFromDeletedQueryGroups.get(1).getTask().getId()); 
when(resourceUsageTrackerService.constructQueryGroupLevelUsageViews()).thenReturn(queryGroupLevelViews); + when(workloadManagementSettings.getWlmMode()).thenReturn(WlmMode.ENABLED); taskCancellation.cancelTasks(() -> true); assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); @@ -374,6 +377,7 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN assertEquals(1001, cancellableTasksFromDeletedQueryGroups.get(1).getTask().getId()); when(resourceUsageTrackerService.constructQueryGroupLevelUsageViews()).thenReturn(queryGroupLevelViews); + when(workloadManagementSettings.getWlmMode()).thenReturn(WlmMode.ENABLED); taskCancellation.cancelTasks(() -> false); assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); @@ -435,6 +439,7 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { assertEquals(8765, cancellableTasksFrom1.get(1).getTask().getId()); when(resourceUsageTrackerService.constructQueryGroupLevelUsageViews()).thenReturn(queryGroupLevelViews); + when(workloadManagementSettings.getWlmMode()).thenReturn(WlmMode.ENABLED); taskCancellation.cancelTasks(() -> true); assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); assertTrue(cancellableTasksFrom.get(1).getTask().isCancelled()); diff --git a/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java b/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java index 81c64e5368186..9dba17ff50a19 100644 --- a/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java +++ b/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java @@ -89,6 +89,7 @@ public void testValidQueryGroupRequestFailure() throws IOException { 0, 1, 0, + 0, Map.of( ResourceType.CPU, new QueryGroupStats.ResourceStats(0, 0, 0), @@ -102,6 +103,7 @@ public void testValidQueryGroupRequestFailure() throws IOException { 0, 0, 0, + 0, 
Map.of( ResourceType.CPU, new QueryGroupStats.ResourceStats(0, 0, 0), @@ -161,6 +163,7 @@ public void testMultiThreadedValidQueryGroupRequestFailures() { 0, ITERATIONS, 0, + 0, Map.of( ResourceType.CPU, new QueryGroupStats.ResourceStats(0, 0, 0), @@ -174,6 +177,7 @@ public void testMultiThreadedValidQueryGroupRequestFailures() { 0, 0, 0, + 0, Map.of( ResourceType.CPU, new QueryGroupStats.ResourceStats(0, 0, 0), @@ -196,6 +200,7 @@ public void testInvalidQueryGroupFailure() throws IOException { 0, 0, 0, + 0, Map.of( ResourceType.CPU, new QueryGroupStats.ResourceStats(0, 0, 0), @@ -209,6 +214,7 @@ public void testInvalidQueryGroupFailure() throws IOException { 0, 0, 0, + 0, Map.of( ResourceType.CPU, new QueryGroupStats.ResourceStats(0, 0, 0), diff --git a/server/src/test/java/org/opensearch/wlm/stats/QueryGroupStateTests.java b/server/src/test/java/org/opensearch/wlm/stats/QueryGroupStateTests.java index 576eec7be1888..566c4261d6878 100644 --- a/server/src/test/java/org/opensearch/wlm/stats/QueryGroupStateTests.java +++ b/server/src/test/java/org/opensearch/wlm/stats/QueryGroupStateTests.java @@ -23,7 +23,13 @@ public void testRandomQueryGroupsStateUpdates() { for (int i = 0; i < 25; i++) { if (i % 5 == 0) { - updaterThreads.add(new Thread(() -> queryGroupState.completions.inc())); + updaterThreads.add(new Thread(() -> { + if (randomBoolean()) { + queryGroupState.completions.inc(); + } else { + queryGroupState.shardCompletions.inc(); + } + })); } else if (i % 5 == 1) { updaterThreads.add(new Thread(() -> { queryGroupState.totalRejections.inc(); @@ -57,7 +63,7 @@ public void testRandomQueryGroupsStateUpdates() { } }); - assertEquals(5, queryGroupState.getCompletions()); + assertEquals(5, queryGroupState.getCompletions() + queryGroupState.getShardCompletions()); assertEquals(5, queryGroupState.getTotalRejections()); final long sumOfRejectionsDueToResourceTypes = queryGroupState.getResourceState().get(ResourceType.CPU).rejections.count() diff --git 
a/server/src/test/java/org/opensearch/wlm/stats/QueryGroupStatsTests.java b/server/src/test/java/org/opensearch/wlm/stats/QueryGroupStatsTests.java index 661c3a7beae40..ac6d19580dacb 100644 --- a/server/src/test/java/org/opensearch/wlm/stats/QueryGroupStatsTests.java +++ b/server/src/test/java/org/opensearch/wlm/stats/QueryGroupStatsTests.java @@ -28,9 +28,10 @@ public void testToXContent() throws IOException { queryGroupId, new QueryGroupStats.QueryGroupStatsHolder( 123456789, + 13, 2, 0, - 13, + 1213718, Map.of(ResourceType.CPU, new QueryGroupStats.ResourceStats(0.3, 13, 2)) ) ); @@ -40,7 +41,7 @@ public void testToXContent() throws IOException { queryGroupStats.toXContent(builder, ToXContent.EMPTY_PARAMS); builder.endObject(); assertEquals( - "{\"query_groups\":{\"afakjklaj304041-afaka\":{\"completions\":123456789,\"rejections\":2,\"failures\":0,\"total_cancellations\":13,\"cpu\":{\"current_usage\":0.3,\"cancellations\":13,\"rejections\":2}}}}", + "{\"query_groups\":{\"afakjklaj304041-afaka\":{\"completions\":123456789,\"shard_completions\":1213718,\"rejections\":13,\"failures\":2,\"total_cancellations\":0,\"cpu\":{\"current_usage\":0.3,\"cancellations\":13,\"rejections\":2}}}}", builder.toString() ); } @@ -60,6 +61,7 @@ protected QueryGroupStats createTestInstance() { randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong(), + randomNonNegativeLong(), Map.of( ResourceType.CPU, new QueryGroupStats.ResourceStats( From 139404a19f0cf9ae50e280a82e350d088c6b9f2e Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Thu, 19 Sep 2024 14:34:50 -0700 Subject: [PATCH 40/47] fix bugs and add logic to make SBP work with wlm Signed-off-by: Kaushal Kumar --- .../main/java/org/opensearch/node/Node.java | 4 +- .../SearchBackpressureService.java | 2 + .../org/opensearch/wlm/QueryGroupService.java | 18 ++++++++- .../wlm/WorkloadManagementSettings.java | 8 ++-- .../SearchBackpressureServiceTests.java | 15 +++++--- .../wlm/QueryGroupServiceTests.java | 38 
++++++++++++++++++- 6 files changed, 72 insertions(+), 13 deletions(-) diff --git a/server/src/main/java/org/opensearch/node/Node.java b/server/src/main/java/org/opensearch/node/Node.java index b12c985787e4f..795d0ba17860d 100644 --- a/server/src/main/java/org/opensearch/node/Node.java +++ b/server/src/main/java/org/opensearch/node/Node.java @@ -1030,10 +1030,10 @@ protected Node( List identityAwarePlugins = pluginsService.filterPlugins(IdentityAwarePlugin.class); identityService.initializeIdentityAwarePlugins(identityAwarePlugins); - QueryGroupResourceUsageTrackerService queryGroupResourceUsageTrackerService = new QueryGroupResourceUsageTrackerService( + final QueryGroupResourceUsageTrackerService queryGroupResourceUsageTrackerService = new QueryGroupResourceUsageTrackerService( taskResourceTrackingService ); - WorkloadManagementSettings workloadManagementSettings = new WorkloadManagementSettings( + final WorkloadManagementSettings workloadManagementSettings = new WorkloadManagementSettings( settings, settingsModule.getClusterSettings() ); diff --git a/server/src/main/java/org/opensearch/search/backpressure/SearchBackpressureService.java b/server/src/main/java/org/opensearch/search/backpressure/SearchBackpressureService.java index 43b9f8ae87529..e8e84e5bc6e92 100644 --- a/server/src/main/java/org/opensearch/search/backpressure/SearchBackpressureService.java +++ b/server/src/main/java/org/opensearch/search/backpressure/SearchBackpressureService.java @@ -42,6 +42,7 @@ import org.opensearch.tasks.TaskResourceTrackingService.TaskCompletionListener; import org.opensearch.threadpool.Scheduler; import org.opensearch.threadpool.ThreadPool; +import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; import java.io.IOException; @@ -344,6 +345,7 @@ List getTa .stream() .filter(type::isInstance) .map(type::cast) + .filter(t -> ((QueryGroupTask) t).getQueryGroupId().equals(QueryGroupTask.DEFAULT_QUERY_GROUP_ID_SUPPLIER.get())) 
.collect(Collectors.toUnmodifiableList()); } diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java index 31743e3db3673..dd57291484a1c 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java @@ -227,6 +227,10 @@ public QueryGroupStats nodeStats() { * @param queryGroupId query group identifier */ public void rejectIfNeeded(String queryGroupId) { + if (workloadManagementSettings.getWlmMode() != WlmMode.ENABLED) { + return; + } + if (queryGroupId == null || queryGroupId.equals(QueryGroupTask.DEFAULT_QUERY_GROUP_ID_SUPPLIER.get())) return; QueryGroupState queryGroupState = queryGroupStateMap.get(queryGroupId); @@ -247,7 +251,10 @@ public void rejectIfNeeded(String queryGroupId) { final StringBuilder reason = new StringBuilder(); for (ResourceType resourceType : TRACKED_RESOURCES) { if (queryGroup.getResourceLimits().containsKey(resourceType)) { - final double threshold = queryGroup.getResourceLimits().get(resourceType); + final double threshold = getNormalisedRejectionThreshold( + queryGroup.getResourceLimits().get(resourceType), + resourceType + ); final double lastRecordedUsage = queryGroupState.getResourceState().get(resourceType).getLastRecordedUsage(); if (threshold < lastRecordedUsage) { reject = true; @@ -272,6 +279,15 @@ public void rejectIfNeeded(String queryGroupId) { }); } + private double getNormalisedRejectionThreshold(double limit, ResourceType resourceType) { + if (resourceType == ResourceType.CPU) { + return limit * workloadManagementSettings.getNodeLevelCpuRejectionThreshold(); + } else if (resourceType == ResourceType.MEMORY) { + return limit * workloadManagementSettings.getNodeLevelMemoryRejectionThreshold(); + } + throw new IllegalArgumentException(resourceType + " is not supported in WLM yet"); + } + public Set getActiveQueryGroups() { return activeQueryGroups; } diff --git 
a/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java b/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java index f59619f43e118..fcc45f95aed7e 100644 --- a/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java +++ b/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java @@ -38,10 +38,10 @@ public class WorkloadManagementSettings { /** * Setting name for QueryGroupService node duress streak */ - public static final String QUERYGROUP_SERVICE_DURESS_STREAK_SETTING_NAME = "wlm.query_group.service.duress_streak"; + public static final String QUERYGROUP_DURESS_STREAK_SETTING_NAME = "wlm.query_group.duress_streak"; private int duressStreak; public static final Setting QUERYGROUP_SERVICE_DURESS_STREAK_SETTING = Setting.intSetting( - QUERYGROUP_SERVICE_DURESS_STREAK_SETTING_NAME, + QUERYGROUP_DURESS_STREAK_SETTING_NAME, 3, 3, Setting.Property.Dynamic, @@ -51,14 +51,14 @@ public class WorkloadManagementSettings { /** * Setting name for Query Group Service run interval */ - public static final String QUERYGROUP_SERVICE_RUN_INTERVAL_SETTING_NAME = "wlm.query_group.service.run_interval"; + public static final String QUERYGROUP_ENFORCEMENT_INTERVAL_SETTING_NAME = "wlm.query_group.enforcement_interval"; private TimeValue queryGroupServiceRunInterval; /** * Setting to control the run interval of Query Group Service */ public static final Setting QUERYGROUP_SERVICE_RUN_INTERVAL_SETTING = Setting.longSetting( - QUERYGROUP_SERVICE_RUN_INTERVAL_SETTING_NAME, + QUERYGROUP_ENFORCEMENT_INTERVAL_SETTING_NAME, DEFAULT_QUERYGROUP_SERVICE_RUN_INTERVAL_MILLIS, 1000, Setting.Property.Dynamic, diff --git a/server/src/test/java/org/opensearch/search/backpressure/SearchBackpressureServiceTests.java b/server/src/test/java/org/opensearch/search/backpressure/SearchBackpressureServiceTests.java index a444eb42eac2e..2a644dfc2f6a4 100644 --- 
a/server/src/test/java/org/opensearch/search/backpressure/SearchBackpressureServiceTests.java +++ b/server/src/test/java/org/opensearch/search/backpressure/SearchBackpressureServiceTests.java @@ -39,6 +39,7 @@ import org.opensearch.test.transport.MockTransportService; import org.opensearch.threadpool.TestThreadPool; import org.opensearch.threadpool.ThreadPool; +import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; import org.junit.After; import org.junit.Before; @@ -261,7 +262,7 @@ public void testSearchTaskInFlightCancellation() { when(settings.getSearchTaskSettings()).thenReturn(searchTaskSettings); // Create a mix of low and high resource usage SearchTasks (50 low + 25 high resource usage tasks). - Map activeSearchTasks = new HashMap<>(); + Map activeSearchTasks = new HashMap<>(); for (long i = 0; i < 75; i++) { if (i % 3 == 0) { activeSearchTasks.put(i, createMockTaskWithResourceStats(SearchTask.class, 500, taskHeapUsageBytes, i)); @@ -269,6 +270,7 @@ public void testSearchTaskInFlightCancellation() { activeSearchTasks.put(i, createMockTaskWithResourceStats(SearchTask.class, 100, taskHeapUsageBytes, i)); } } + activeSearchTasks.values().forEach(task -> task.setQueryGroupId(threadPool.getThreadContext())); doReturn(activeSearchTasks).when(mockTaskResourceTrackingService).getResourceAwareTasks(); // There are 25 SearchTasks eligible for cancellation but only 5 will be cancelled (burst limit). @@ -353,7 +355,7 @@ public void testSearchShardTaskInFlightCancellation() { when(settings.getSearchShardTaskSettings()).thenReturn(searchShardTaskSettings); // Create a mix of low and high resource usage tasks (60 low + 15 high resource usage tasks). 
- Map activeSearchShardTasks = new HashMap<>(); + Map activeSearchShardTasks = new HashMap<>(); for (long i = 0; i < 75; i++) { if (i % 5 == 0) { activeSearchShardTasks.put(i, createMockTaskWithResourceStats(SearchShardTask.class, 500, taskHeapUsageBytes, i)); @@ -361,6 +363,7 @@ public void testSearchShardTaskInFlightCancellation() { activeSearchShardTasks.put(i, createMockTaskWithResourceStats(SearchShardTask.class, 100, taskHeapUsageBytes, i)); } } + activeSearchShardTasks.values().forEach(task -> task.setQueryGroupId(threadPool.getThreadContext())); doReturn(activeSearchShardTasks).when(mockTaskResourceTrackingService).getResourceAwareTasks(); // There are 15 SearchShardTasks eligible for cancellation but only 10 will be cancelled (burst limit). @@ -451,7 +454,7 @@ public void testNonCancellationOfHeapBasedTasksWhenHeapNotInDuress() { when(settings.getSearchTaskSettings()).thenReturn(searchTaskSettings); // Create a mix of low and high resource usage tasks (60 low + 15 high resource usage tasks). - Map activeSearchTasks = new HashMap<>(); + Map activeSearchTasks = new HashMap<>(); for (long i = 0; i < 75; i++) { if (i % 5 == 0) { activeSearchTasks.put(i, createMockTaskWithResourceStats(SearchTask.class, 500, 800, i)); @@ -459,6 +462,7 @@ public void testNonCancellationOfHeapBasedTasksWhenHeapNotInDuress() { activeSearchTasks.put(i, createMockTaskWithResourceStats(SearchTask.class, 100, 800, i)); } } + activeSearchTasks.values().forEach(task -> task.setQueryGroupId(threadPool.getThreadContext())); doReturn(activeSearchTasks).when(mockTaskResourceTrackingService).getResourceAwareTasks(); // this will trigger cancellation but these cancellation should only be cpu based @@ -549,15 +553,16 @@ public void testNonCancellationWhenSearchTrafficIsNotQualifyingForCancellation() when(settings.getSearchTaskSettings()).thenReturn(searchTaskSettings); // Create a mix of low and high resource usage tasks (60 low + 15 high resource usage tasks). 
- Map activeSearchTasks = new HashMap<>(); + Map activeSearchTasks = new HashMap<>(); for (long i = 0; i < 75; i++) { - Class taskType = randomBoolean() ? SearchTask.class : SearchShardTask.class; + Class taskType = randomBoolean() ? SearchTask.class : SearchShardTask.class; if (i % 5 == 0) { activeSearchTasks.put(i, createMockTaskWithResourceStats(taskType, 500, 800, i)); } else { activeSearchTasks.put(i, createMockTaskWithResourceStats(taskType, 100, 800, i)); } } + activeSearchTasks.values().forEach(task -> task.setQueryGroupId(threadPool.getThreadContext())); doReturn(activeSearchTasks).when(mockTaskResourceTrackingService).getResourceAwareTasks(); // this will trigger cancellation but the cancellation should not happen as the node is not is duress because of search traffic diff --git a/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java b/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java index 7105df1b7493c..4a4520cc671f5 100644 --- a/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java +++ b/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java @@ -251,7 +251,7 @@ public void testRejectIfNeeded_whenQueryGroupIsEnforcedMode_andNotBreaching() { }; mockQueryGroupStateMap = new HashMap<>(); QueryGroupState queryGroupState = new QueryGroupState(); - queryGroupState.getResourceState().get(ResourceType.CPU).setLastRecordedUsage(0.08); + queryGroupState.getResourceState().get(ResourceType.CPU).setLastRecordedUsage(0.05); mockQueryGroupStateMap.put("queryGroupId1", queryGroupState); @@ -265,6 +265,8 @@ public void testRejectIfNeeded_whenQueryGroupIsEnforcedMode_andNotBreaching() { activeQueryGroups, new HashSet<>() ); + when(mockWorkloadManagementSettings.getWlmMode()).thenReturn(WlmMode.ENABLED); + when(mockWorkloadManagementSettings.getNodeLevelCpuRejectionThreshold()).thenReturn(0.8); queryGroupService.rejectIfNeeded("queryGroupId1"); // verify the check to compare the current usage and limit @@ -308,6 
+310,7 @@ public void testRejectIfNeeded_whenQueryGroupIsEnforcedMode_andBreaching() { activeQueryGroups, new HashSet<>() ); + when(mockWorkloadManagementSettings.getWlmMode()).thenReturn(WlmMode.ENABLED); assertThrows(OpenSearchRejectedExecutionException.class, () -> queryGroupService.rejectIfNeeded("queryGroupId1")); // verify the check to compare the current usage and limit @@ -322,6 +325,39 @@ public void testRejectIfNeeded_whenQueryGroupIsEnforcedMode_andBreaching() { assertEquals(1, queryGroupState.totalRejections.count()); } + public void testRejectIfNeeded_whenFeatureIsNotEnabled() { + QueryGroup testQueryGroup = new QueryGroup( + "testQueryGroup", + "queryGroupId1", + new MutableQueryGroupFragment(MutableQueryGroupFragment.ResiliencyMode.ENFORCED, Map.of(ResourceType.CPU, 0.10)), + 1L + ); + Set activeQueryGroups = new HashSet<>() { + { + add(testQueryGroup); + } + }; + mockQueryGroupStateMap = new HashMap<>(); + mockQueryGroupStateMap.put("queryGroupId1", new QueryGroupState()); + + Map spyMap = spy(mockQueryGroupStateMap); + + queryGroupService = new QueryGroupService( + mockCancellationService, + mockClusterService, + mockThreadPool, + mockWorkloadManagementSettings, + mockNodeDuressTrackers, + spyMap, + activeQueryGroups, + new HashSet<>() + ); + when(mockWorkloadManagementSettings.getWlmMode()).thenReturn(WlmMode.DISABLED); + + queryGroupService.rejectIfNeeded(testQueryGroup.get_id()); + verify(spyMap, never()).get(any()); + } + public void testOnTaskCompleted() { Task task = createMockTaskWithResourceStats(SearchTask.class, 100, 200, 0, 12); mockThreadPool = new TestThreadPool("queryGroupServiceTests"); From 56c3393d88609244190ca5b5a5ee3fc555879f4b Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Mon, 30 Sep 2024 17:33:29 -0700 Subject: [PATCH 41/47] address comments Signed-off-by: Kaushal Kumar --- .../main/java/org/opensearch/node/Node.java | 4 +- .../SearchBackpressureService.java | 15 +++-- .../org/opensearch/wlm/QueryGroupService.java | 20 
+++++- .../org/opensearch/wlm/QueryGroupTask.java | 5 +- .../wlm/WorkloadManagementSettings.java | 6 +- .../QueryGroupTaskCancellationService.java | 61 +++++++---------- .../SearchBackpressureServiceTests.java | 37 ++++++++-- .../wlm/QueryGroupServiceTests.java | 67 ++++++++++++++++--- ...ueryGroupTaskCancellationServiceTests.java | 56 ++++++---------- 9 files changed, 170 insertions(+), 101 deletions(-) diff --git a/server/src/main/java/org/opensearch/node/Node.java b/server/src/main/java/org/opensearch/node/Node.java index 795d0ba17860d..f04dab3663356 100644 --- a/server/src/main/java/org/opensearch/node/Node.java +++ b/server/src/main/java/org/opensearch/node/Node.java @@ -1210,7 +1210,8 @@ protected Node( searchBackpressureSettings, taskResourceTrackingService, threadPool, - transportService.getTaskManager() + transportService.getTaskManager(), + queryGroupService ); final SegmentReplicationStatsTracker segmentReplicationStatsTracker = new SegmentReplicationStatsTracker(indicesService); @@ -1789,6 +1790,7 @@ private Node stop() { injector.getInstance(FsHealthService.class).stop(); injector.getInstance(NodeResourceUsageTracker.class).stop(); injector.getInstance(ResourceUsageCollectorService.class).stop(); + injector.getInstance(QueryGroupService.class).stop(); nodeService.getMonitorService().stop(); nodeService.getSearchBackpressureService().stop(); injector.getInstance(GatewayService.class).stop(); diff --git a/server/src/main/java/org/opensearch/search/backpressure/SearchBackpressureService.java b/server/src/main/java/org/opensearch/search/backpressure/SearchBackpressureService.java index e8e84e5bc6e92..466e6b15246ee 100644 --- a/server/src/main/java/org/opensearch/search/backpressure/SearchBackpressureService.java +++ b/server/src/main/java/org/opensearch/search/backpressure/SearchBackpressureService.java @@ -42,7 +42,7 @@ import org.opensearch.tasks.TaskResourceTrackingService.TaskCompletionListener; import org.opensearch.threadpool.Scheduler; import 
org.opensearch.threadpool.ThreadPool; -import org.opensearch.wlm.QueryGroupTask; +import org.opensearch.wlm.QueryGroupService; import org.opensearch.wlm.ResourceType; import java.io.IOException; @@ -87,12 +87,14 @@ public class SearchBackpressureService extends AbstractLifecycleComponent implem private final Map, SearchBackpressureState> searchBackpressureStates; private final TaskManager taskManager; + private final QueryGroupService queryGroupService; public SearchBackpressureService( SearchBackpressureSettings settings, TaskResourceTrackingService taskResourceTrackingService, ThreadPool threadPool, - TaskManager taskManager + TaskManager taskManager, + QueryGroupService queryGroupService ) { this(settings, taskResourceTrackingService, threadPool, System::nanoTime, new NodeDuressTrackers(new EnumMap<>(ResourceType.class) { { @@ -132,7 +134,8 @@ public SearchBackpressureService( settings.getClusterSettings(), SearchShardTaskSettings.SETTING_HEAP_MOVING_AVERAGE_WINDOW_SIZE ), - taskManager + taskManager, + queryGroupService ); } @@ -144,7 +147,8 @@ public SearchBackpressureService( NodeDuressTrackers nodeDuressTrackers, TaskResourceUsageTrackers searchTaskTrackers, TaskResourceUsageTrackers searchShardTaskTrackers, - TaskManager taskManager + TaskManager taskManager, + QueryGroupService queryGroupService ) { this.settings = settings; this.taskResourceTrackingService = taskResourceTrackingService; @@ -152,6 +156,7 @@ public SearchBackpressureService( this.threadPool = threadPool; this.nodeDuressTrackers = nodeDuressTrackers; this.taskManager = taskManager; + this.queryGroupService = queryGroupService; this.searchBackpressureStates = Map.of( SearchTask.class, @@ -345,7 +350,7 @@ List getTa .stream() .filter(type::isInstance) .map(type::cast) - .filter(t -> ((QueryGroupTask) t).getQueryGroupId().equals(QueryGroupTask.DEFAULT_QUERY_GROUP_ID_SUPPLIER.get())) + .filter(queryGroupService::shouldSBPHandle) .collect(Collectors.toUnmodifiableList()); } diff --git 
a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java index dd57291484a1c..b503c96fb4ed6 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java @@ -127,9 +127,8 @@ protected void doRun() { if (workloadManagementSettings.getWlmMode() == WlmMode.DISABLED) { return; } - taskCancellationService.refreshQueryGroups(activeQueryGroups, deletedQueryGroups); - taskCancellationService.cancelTasks(nodeDuressTrackers::isNodeInDuress); - taskCancellationService.pruneDeletedQueryGroups(); + taskCancellationService.cancelTasks(nodeDuressTrackers::isNodeInDuress, activeQueryGroups, deletedQueryGroups); + taskCancellationService.pruneDeletedQueryGroups(deletedQueryGroups); } private QueryGroupState getQueryGroupState(final String queryGroupId) { @@ -296,6 +295,21 @@ public Set getDeletedQueryGroups() { return deletedQueryGroups; } + /** + * This method determines whether the task should be accounted by SBP if both features co-exist + * @param t QueryGroupTask + * @return whether or not SBP handle it + */ + public boolean shouldSBPHandle(Task t) { + QueryGroupTask task = (QueryGroupTask) t; + boolean isInvalidQueryGroupTask = true; + if (!task.getQueryGroupId().equals(QueryGroupTask.DEFAULT_QUERY_GROUP_ID_SUPPLIER.get())) { + isInvalidQueryGroupTask = activeQueryGroups.stream() + .noneMatch(queryGroup -> queryGroup.get_id().equals(task.getQueryGroupId())); + } + return workloadManagementSettings.getWlmMode() != WlmMode.ENABLED || isInvalidQueryGroupTask; + } + @Override public void onTaskCompleted(Task task) { if (!(task instanceof QueryGroupTask)) { diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java b/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java index a1cb766579d43..97a86b5aaa2bf 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java +++ 
b/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java @@ -83,7 +83,10 @@ public final String getQueryGroupId() { */ public final void setQueryGroupId(final ThreadContext threadContext) { this.queryGroupId = Optional.ofNullable(threadContext) - .map(threadContext1 -> threadContext1.getHeader(QUERY_GROUP_ID_HEADER)) + .map( + threadContext1 -> Optional.ofNullable(threadContext1.getHeader(QUERY_GROUP_ID_HEADER)) + .orElse(DEFAULT_QUERY_GROUP_ID_SUPPLIER.get()) + ) .orElse(DEFAULT_QUERY_GROUP_ID_SUPPLIER.get()); } diff --git a/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java b/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java index fcc45f95aed7e..af25eedd7eed5 100644 --- a/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java +++ b/server/src/main/java/org/opensearch/wlm/WorkloadManagementSettings.java @@ -184,7 +184,7 @@ public int getDuressStreak() { * node duress streak setter * @param duressStreak new value */ - public void setDuressStreak(int duressStreak) { + private void setDuressStreak(int duressStreak) { this.duressStreak = duressStreak; } @@ -192,7 +192,7 @@ public void setDuressStreak(int duressStreak) { * queryGroupServiceRunInterval setter * @param newIntervalInMillis new value */ - public void setQueryGroupServiceRunInterval(long newIntervalInMillis) { + private void setQueryGroupServiceRunInterval(long newIntervalInMillis) { this.queryGroupServiceRunInterval = TimeValue.timeValueMillis(newIntervalInMillis); } @@ -208,7 +208,7 @@ public TimeValue getQueryGroupServiceRunInterval() { * WlmMode setter * @param mode new mode value */ - public void setWlmMode(final WlmMode mode) { + private void setWlmMode(final WlmMode mode) { this.wlmMode = mode; } diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java b/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java index bec2b4845d075..f0438ca12c557 100644 --- 
a/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java @@ -23,7 +23,6 @@ import java.util.ArrayList; import java.util.Collection; -import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.function.BooleanSupplier; @@ -59,30 +58,16 @@ public class QueryGroupTaskCancellationService { private final QueryGroupResourceUsageTrackerService resourceUsageTrackerService; // a map of QueryGroupId to its corresponding QueryGroupLevelResourceUsageView object Map queryGroupLevelResourceUsageViews; - private Collection activeQueryGroups; - private Collection deletedQueryGroups; private Function queryGroupStateAccessor; public QueryGroupTaskCancellationService( WorkloadManagementSettings workloadManagementSettings, TaskSelectionStrategy taskSelectionStrategy, QueryGroupResourceUsageTrackerService resourceUsageTrackerService - ) { - this(workloadManagementSettings, taskSelectionStrategy, resourceUsageTrackerService, new HashSet<>(), new HashSet<>()); - } - - public QueryGroupTaskCancellationService( - WorkloadManagementSettings workloadManagementSettings, - TaskSelectionStrategy taskSelectionStrategy, - QueryGroupResourceUsageTrackerService resourceUsageTrackerService, - Collection activeQueryGroups, - Collection deletedQueryGroups ) { this.workloadManagementSettings = workloadManagementSettings; this.taskSelectionStrategy = taskSelectionStrategy; this.resourceUsageTrackerService = resourceUsageTrackerService; - this.activeQueryGroups = activeQueryGroups; - this.deletedQueryGroups = deletedQueryGroups; } public void setQueryGroupStateMapAccessor(final Function queryGroupStateAccessor) { @@ -92,12 +77,16 @@ public void setQueryGroupStateMapAccessor(final Function activeQueryGroups, + Collection deletedQueryGroups + ) { queryGroupLevelResourceUsageViews = 
resourceUsageTrackerService.constructQueryGroupLevelUsageViews(); // cancel tasks from QueryGroups that are in Enforced mode that are breaching their resource limits - cancelTasks(ResiliencyMode.ENFORCED); + cancelTasks(ResiliencyMode.ENFORCED, activeQueryGroups); // if the node is in duress, cancel tasks accordingly. - handleNodeDuress(isNodeInDuress); + handleNodeDuress(isNodeInDuress, activeQueryGroups, deletedQueryGroups); updateResourceUsageInQueryGroupState(); } @@ -113,12 +102,19 @@ private void updateResourceUsageInQueryGroupState() { } } - private void handleNodeDuress(BooleanSupplier isNodeInDuress) { + private void handleNodeDuress( + BooleanSupplier isNodeInDuress, + Collection activeQueryGroups, + Collection deletedQueryGroups + ) { if (!isNodeInDuress.getAsBoolean()) { return; } // List of tasks to be executed in order if the node is in duress - List> duressActions = List.of(v -> cancelTasksFromDeletedQueryGroups(), v -> cancelTasks(ResiliencyMode.SOFT)); + List> duressActions = List.of( + v -> cancelTasksFromDeletedQueryGroups(deletedQueryGroups), + v -> cancelTasks(ResiliencyMode.SOFT, activeQueryGroups) + ); for (Consumer duressAction : duressActions) { if (!isNodeInDuress.getAsBoolean()) { @@ -128,8 +124,8 @@ private void handleNodeDuress(BooleanSupplier isNodeInDuress) { } } - private void cancelTasksFromDeletedQueryGroups() { - cancelTasks(getAllCancellableTasks(this.deletedQueryGroups)); + private void cancelTasksFromDeletedQueryGroups(Collection deletedQueryGroups) { + cancelTasks(getAllCancellableTasks(deletedQueryGroups)); } /** @@ -137,9 +133,9 @@ private void cancelTasksFromDeletedQueryGroups() { * * @return List of tasks that can be cancelled */ - List getAllCancellableTasks(ResiliencyMode resiliencyMode) { + List getAllCancellableTasks(ResiliencyMode resiliencyMode, Collection queryGroups) { return getAllCancellableTasks( - activeQueryGroups.stream().filter(queryGroup -> queryGroup.getResiliencyMode() == 
resiliencyMode).collect(Collectors.toList()) + queryGroups.stream().filter(queryGroup -> queryGroup.getResiliencyMode() == resiliencyMode).collect(Collectors.toList()) ); } @@ -160,7 +156,6 @@ List getAllCancellableTasks(Collection queryGroups .calculateResourceUsage(selectedTasks); if (excessUsage > MIN_VALUE) { reasons.add(new TaskCancellation.Reason(generateReasonString(queryGroup, resourceType), 1)); - // TODO: We will need to add the cancellation callback for these resources for the queryGroup to reflect stats onCancelCallbacks.add(this.getResourceTypeOnCancelCallback(queryGroup.get_id(), resourceType)); // Only add tasks not already added to avoid double cancellations selectedTasks.addAll( @@ -198,8 +193,8 @@ private List getTasksFor(QueryGroup queryGroup) { return queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getActiveTasks(); } - private void cancelTasks(ResiliencyMode resiliencyMode) { - cancelTasks(getAllCancellableTasks(resiliencyMode)); + private void cancelTasks(ResiliencyMode resiliencyMode, Collection queryGroups) { + cancelTasks(getAllCancellableTasks(resiliencyMode, queryGroups)); } private void cancelTasks(List cancellableTasks) { @@ -255,20 +250,10 @@ private QueryGroupState getQueryGroupState(String queryGroupId) { return queryGroupStateAccessor.apply(queryGroupId); } - /** - * sets the current active and deleted query groups - * @param activeQueryGroups - * @param deletedQueryGroups - */ - public void refreshQueryGroups(Collection activeQueryGroups, Collection deletedQueryGroups) { - this.activeQueryGroups = activeQueryGroups; - this.deletedQueryGroups = deletedQueryGroups; - } - /** * Removes the queryGroups from deleted list if it doesn't have any tasks running */ - public void pruneDeletedQueryGroups() { + public void pruneDeletedQueryGroups(Collection deletedQueryGroups) { List currentDeletedQueryGroups = new ArrayList<>(deletedQueryGroups); for (QueryGroup queryGroup : currentDeletedQueryGroups) { if 
(queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getActiveTasks().isEmpty()) { diff --git a/server/src/test/java/org/opensearch/search/backpressure/SearchBackpressureServiceTests.java b/server/src/test/java/org/opensearch/search/backpressure/SearchBackpressureServiceTests.java index 2a644dfc2f6a4..8c4685bee6a0b 100644 --- a/server/src/test/java/org/opensearch/search/backpressure/SearchBackpressureServiceTests.java +++ b/server/src/test/java/org/opensearch/search/backpressure/SearchBackpressureServiceTests.java @@ -39,6 +39,7 @@ import org.opensearch.test.transport.MockTransportService; import org.opensearch.threadpool.TestThreadPool; import org.opensearch.threadpool.ThreadPool; +import org.opensearch.wlm.QueryGroupService; import org.opensearch.wlm.QueryGroupTask; import org.opensearch.wlm.ResourceType; import org.junit.After; @@ -76,10 +77,12 @@ public class SearchBackpressureServiceTests extends OpenSearchTestCase { MockTransportService transportService; TaskManager taskManager; ThreadPool threadPool; + QueryGroupService queryGroupService; @Before public void setup() { threadPool = new TestThreadPool(getClass().getName()); + queryGroupService = mock(QueryGroupService.class); transportService = MockTransportService.createNewService(Settings.EMPTY, Version.CURRENT, threadPool, NoopTracer.INSTANCE); transportService.start(); transportService.acceptIncomingRequests(); @@ -121,9 +124,12 @@ public void testIsNodeInDuress() { new NodeDuressTrackers(duressTrackers), new TaskResourceUsageTrackers(), new TaskResourceUsageTrackers(), - taskManager + taskManager, + queryGroupService ); + when(queryGroupService.shouldSBPHandle(any())).thenReturn(true); + // Node not in duress. 
cpuUsage.set(0.0); heapUsage.set(0.0); @@ -164,9 +170,12 @@ public void testTrackerStateUpdateOnSearchTaskCompletion() { new NodeDuressTrackers(new EnumMap<>(ResourceType.class)), taskResourceUsageTrackers, new TaskResourceUsageTrackers(), - taskManager + taskManager, + queryGroupService ); + when(queryGroupService.shouldSBPHandle(any())).thenReturn(true); + for (int i = 0; i < 100; i++) { // service.onTaskCompleted(new SearchTask(1, "test", "test", () -> "Test", TaskId.EMPTY_TASK_ID, new HashMap<>())); service.onTaskCompleted(createMockTaskWithResourceStats(SearchTask.class, 100, 200, i)); @@ -195,9 +204,12 @@ public void testTrackerStateUpdateOnSearchShardTaskCompletion() { new NodeDuressTrackers(new EnumMap<>(ResourceType.class)), new TaskResourceUsageTrackers(), taskResourceUsageTrackers, - taskManager + taskManager, + queryGroupService ); + when(queryGroupService.shouldSBPHandle(any())).thenReturn(true); + // Record task completions to update the tracker state. Tasks other than SearchTask & SearchShardTask are ignored. service.onTaskCompleted(createMockTaskWithResourceStats(CancellableTask.class, 100, 200, 101)); for (int i = 0; i < 100; i++) { @@ -247,9 +259,12 @@ public void testSearchTaskInFlightCancellation() { new NodeDuressTrackers(duressTrackers), taskResourceUsageTrackers, new TaskResourceUsageTrackers(), - mockTaskManager + mockTaskManager, + queryGroupService ); + when(queryGroupService.shouldSBPHandle(any())).thenReturn(true); + // Run two iterations so that node is marked 'in duress' from the third iteration onwards. service.doRun(); service.doRun(); @@ -340,9 +355,12 @@ public void testSearchShardTaskInFlightCancellation() { nodeDuressTrackers, new TaskResourceUsageTrackers(), taskResourceUsageTrackers, - mockTaskManager + mockTaskManager, + queryGroupService ); + when(queryGroupService.shouldSBPHandle(any())).thenReturn(true); + // Run two iterations so that node is marked 'in duress' from the third iteration onwards. 
service.doRun(); service.doRun(); @@ -442,9 +460,12 @@ public void testNonCancellationOfHeapBasedTasksWhenHeapNotInDuress() { nodeDuressTrackers, taskResourceUsageTrackers, new TaskResourceUsageTrackers(), - mockTaskManager + mockTaskManager, + queryGroupService ); + when(queryGroupService.shouldSBPHandle(any())).thenReturn(true); + service.doRun(); service.doRun(); @@ -538,10 +559,12 @@ public void testNonCancellationWhenSearchTrafficIsNotQualifyingForCancellation() nodeDuressTrackers, taskResourceUsageTrackers, new TaskResourceUsageTrackers(), - mockTaskManager + mockTaskManager, + queryGroupService ) ); + when(queryGroupService.shouldSBPHandle(any())).thenReturn(true); when(service.isHeapUsageDominatedBySearch(anyList(), anyDouble())).thenReturn(false); service.doRun(); diff --git a/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java b/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java index 4a4520cc671f5..0e4303a1983a2 100644 --- a/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java +++ b/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java @@ -28,6 +28,7 @@ import org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService; import java.util.Collection; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Map; @@ -40,7 +41,6 @@ import static org.opensearch.wlm.tracker.ResourceUsageCalculatorTests.createMockTaskWithResourceStats; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.eq; -import static org.mockito.Mockito.doNothing; import static org.mockito.Mockito.never; import static org.mockito.Mockito.spy; import static org.mockito.Mockito.times; @@ -140,16 +140,14 @@ public void testDoStop_CancelsScheduledTask() { public void testDoRun_WhenModeEnabled() { when(mockWorkloadManagementSettings.getWlmMode()).thenReturn(WlmMode.ENABLED); when(mockNodeDuressTrackers.isNodeInDuress()).thenReturn(true); - 
doNothing().when(mockCancellationService).refreshQueryGroups(any(), any()); // Call the method queryGroupService.doRun(); // Verify that refreshQueryGroups was called - Mockito.verify(mockCancellationService).refreshQueryGroups(any(), any()); // Verify that cancelTasks was called with a BooleanSupplier ArgumentCaptor booleanSupplierCaptor = ArgumentCaptor.forClass(BooleanSupplier.class); - Mockito.verify(mockCancellationService).cancelTasks(booleanSupplierCaptor.capture()); + Mockito.verify(mockCancellationService).cancelTasks(booleanSupplierCaptor.capture(), any(), any()); // Assert the behavior of the BooleanSupplier BooleanSupplier capturedSupplier = booleanSupplierCaptor.getValue(); @@ -162,9 +160,8 @@ public void testDoRun_WhenModeDisabled() { when(mockNodeDuressTrackers.isNodeInDuress()).thenReturn(false); queryGroupService.doRun(); // Verify that refreshQueryGroups was called - Mockito.verify(mockCancellationService, never()).refreshQueryGroups(any(), any()); - Mockito.verify(mockCancellationService, never()).cancelTasks(any()); + Mockito.verify(mockCancellationService, never()).cancelTasks(any(), any(), any()); } @@ -404,6 +401,56 @@ public void testOnTaskCompleted() { mockThreadPool.shutdown(); } + public void testShouldSBPHandle() { + QueryGroupTask task = createMockTaskWithResourceStats(SearchTask.class, 100, 200, 0, 12); + QueryGroupState queryGroupState = new QueryGroupState(); + Set activeQueryGroups = new HashSet<>(); + mockQueryGroupStateMap.put("testId", queryGroupState); + queryGroupService = new QueryGroupService( + mockCancellationService, + mockClusterService, + mockThreadPool, + mockWorkloadManagementSettings, + mockNodeDuressTrackers, + mockQueryGroupStateMap, + activeQueryGroups, + Collections.emptySet() + ); + + when(mockWorkloadManagementSettings.getWlmMode()).thenReturn(WlmMode.ENABLED); + + // Default queryGroupId + mockThreadPool = new TestThreadPool("queryGroupServiceTests"); + mockThreadPool.getThreadContext() + 
.putHeader(QueryGroupTask.QUERY_GROUP_ID_HEADER, QueryGroupTask.DEFAULT_QUERY_GROUP_ID_SUPPLIER.get()); + task.setQueryGroupId(mockThreadPool.getThreadContext()); + assertTrue(queryGroupService.shouldSBPHandle(task)); + + mockThreadPool.shutdownNow(); + + // invalid queryGroup task + mockThreadPool = new TestThreadPool("queryGroupServiceTests"); + mockThreadPool.getThreadContext().putHeader(QueryGroupTask.QUERY_GROUP_ID_HEADER, "testId"); + task.setQueryGroupId(mockThreadPool.getThreadContext()); + assertTrue(queryGroupService.shouldSBPHandle(task)); + + // Valid query group task but wlm not enabled + when(mockWorkloadManagementSettings.getWlmMode()).thenReturn(WlmMode.DISABLED); + activeQueryGroups.add( + new QueryGroup( + "testQueryGroup", + "testId", + new MutableQueryGroupFragment( + MutableQueryGroupFragment.ResiliencyMode.ENFORCED, + Map.of(ResourceType.CPU, 0.10, ResourceType.MEMORY, 0.10) + ), + 1L + ) + ); + assertTrue(queryGroupService.shouldSBPHandle(task)); + + } + // This is needed to test the behavior of QueryGroupService#doRun method static class TestQueryGroupCancellationService extends QueryGroupTaskCancellationService { public TestQueryGroupCancellationService( @@ -413,11 +460,15 @@ public TestQueryGroupCancellationService( Collection activeQueryGroups, Collection deletedQueryGroups ) { - super(workloadManagementSettings, taskSelectionStrategy, resourceUsageTrackerService, activeQueryGroups, deletedQueryGroups); + super(workloadManagementSettings, taskSelectionStrategy, resourceUsageTrackerService); } @Override - public void cancelTasks(BooleanSupplier isNodeInDuress) { + public void cancelTasks( + BooleanSupplier isNodeInDuress, + Collection activeQueryGroups, + Collection deletedQueryGroups + ) { } } diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java index 4395e0cafac68..c3adda66b9af5 
100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java @@ -66,9 +66,7 @@ public void setup() { taskCancellation = new QueryGroupTaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), - resourceUsageTrackerService, - activeQueryGroups, - deletedQueryGroups + resourceUsageTrackerService ); taskCancellation.setQueryGroupStateMapAccessor((x) -> new QueryGroupState()); } @@ -143,7 +141,7 @@ public void testGetCancellableTasksFrom_returnsTasksWhenBreachingThresholdForMem activeQueryGroups.add(queryGroup1); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; - List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED, activeQueryGroups); assertEquals(2, cancellableTasksFrom.size()); assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); @@ -191,12 +189,10 @@ public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { QueryGroupTaskCancellationService taskCancellation = new QueryGroupTaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), - resourceUsageTrackerService, - activeQueryGroups, - deletedQueryGroups + resourceUsageTrackerService ); - List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.SOFT); + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.SOFT, activeQueryGroups); assertEquals(0, cancellableTasksFrom.size()); } @@ -223,22 +219,20 @@ public void testCancelTasks_cancelsGivenTasks() { QueryGroupTaskCancellationService taskCancellation = new QueryGroupTaskCancellationService( workloadManagementSettings, new 
MaximumResourceTaskSelectionStrategy(), - resourceUsageTrackerService, - activeQueryGroups, - deletedQueryGroups + resourceUsageTrackerService ); taskCancellation.setQueryGroupStateMapAccessor((x) -> new QueryGroupState()); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; - List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED, activeQueryGroups); assertEquals(2, cancellableTasksFrom.size()); assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); when(resourceUsageTrackerService.constructQueryGroupLevelUsageViews()).thenReturn(queryGroupLevelViews); when(workloadManagementSettings.getWlmMode()).thenReturn(WlmMode.ENABLED); - taskCancellation.cancelTasks(() -> false); + taskCancellation.cancelTasks(() -> false, activeQueryGroups, deletedQueryGroups); assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); assertTrue(cancellableTasksFrom.get(1).getTask().isCancelled()); } @@ -287,14 +281,12 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { QueryGroupTaskCancellationService taskCancellation = new QueryGroupTaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), - resourceUsageTrackerService, - activeQueryGroups, - deletedQueryGroups + resourceUsageTrackerService ); taskCancellation.setQueryGroupStateMapAccessor((x) -> new QueryGroupState()); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; - List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED, activeQueryGroups); assertEquals(2, cancellableTasksFrom.size()); assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); assertEquals(4321, 
cancellableTasksFrom.get(1).getTask().getId()); @@ -306,7 +298,7 @@ public void testCancelTasks_cancelsTasksFromDeletedQueryGroups() { when(resourceUsageTrackerService.constructQueryGroupLevelUsageViews()).thenReturn(queryGroupLevelViews); when(workloadManagementSettings.getWlmMode()).thenReturn(WlmMode.ENABLED); - taskCancellation.cancelTasks(() -> true); + taskCancellation.cancelTasks(() -> true, activeQueryGroups, deletedQueryGroups); assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); assertTrue(cancellableTasksFrom.get(1).getTask().isCancelled()); @@ -359,14 +351,12 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN QueryGroupTaskCancellationService taskCancellation = new QueryGroupTaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), - resourceUsageTrackerService, - activeQueryGroups, - deletedQueryGroups + resourceUsageTrackerService ); taskCancellation.setQueryGroupStateMapAccessor((x) -> new QueryGroupState()); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; - List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED, activeQueryGroups); assertEquals(2, cancellableTasksFrom.size()); assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); @@ -378,7 +368,7 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN when(resourceUsageTrackerService.constructQueryGroupLevelUsageViews()).thenReturn(queryGroupLevelViews); when(workloadManagementSettings.getWlmMode()).thenReturn(WlmMode.ENABLED); - taskCancellation.cancelTasks(() -> false); + taskCancellation.cancelTasks(() -> false, activeQueryGroups, deletedQueryGroups); assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); 
assertTrue(cancellableTasksFrom.get(1).getTask().isCancelled()); @@ -420,27 +410,25 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { QueryGroupTaskCancellationService taskCancellation = new QueryGroupTaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), - resourceUsageTrackerService, - activeQueryGroups, - deletedQueryGroups + resourceUsageTrackerService ); taskCancellation.setQueryGroupStateMapAccessor((x) -> new QueryGroupState()); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; - List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); + List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED, activeQueryGroups); assertEquals(2, cancellableTasksFrom.size()); assertEquals(1234, cancellableTasksFrom.get(0).getTask().getId()); assertEquals(4321, cancellableTasksFrom.get(1).getTask().getId()); - List cancellableTasksFrom1 = taskCancellation.getAllCancellableTasks(ResiliencyMode.SOFT); + List cancellableTasksFrom1 = taskCancellation.getAllCancellableTasks(ResiliencyMode.SOFT, activeQueryGroups); assertEquals(2, cancellableTasksFrom1.size()); assertEquals(5678, cancellableTasksFrom1.get(0).getTask().getId()); assertEquals(8765, cancellableTasksFrom1.get(1).getTask().getId()); when(resourceUsageTrackerService.constructQueryGroupLevelUsageViews()).thenReturn(queryGroupLevelViews); when(workloadManagementSettings.getWlmMode()).thenReturn(WlmMode.ENABLED); - taskCancellation.cancelTasks(() -> true); + taskCancellation.cancelTasks(() -> true, activeQueryGroups, deletedQueryGroups); assertTrue(cancellableTasksFrom.get(0).getTask().isCancelled()); assertTrue(cancellableTasksFrom.get(1).getTask().isCancelled()); assertTrue(cancellableTasksFrom1.get(0).getTask().isCancelled()); @@ -468,7 +456,7 @@ public void testGetAllCancellableTasks_ReturnsNoTasksWhenNotBreachingThresholds( activeQueryGroups.add(queryGroup1); 
taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; - List allCancellableTasks = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); + List allCancellableTasks = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED, activeQueryGroups); assertTrue(allCancellableTasks.isEmpty()); } @@ -491,7 +479,7 @@ public void testGetAllCancellableTasks_ReturnsTasksWhenBreachingThresholds() { activeQueryGroups.add(queryGroup1); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; - List allCancellableTasks = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED); + List allCancellableTasks = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED, activeQueryGroups); assertEquals(2, allCancellableTasks.size()); assertEquals(1234, allCancellableTasks.get(0).getTask().getId()); assertEquals(4321, allCancellableTasks.get(1).getTask().getId()); @@ -555,14 +543,12 @@ public void testPruneDeletedQueryGroups() { QueryGroupTaskCancellationService taskCancellation = new QueryGroupTaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), - resourceUsageTrackerService, - activeQueryGroups, - deletedQueryGroups + resourceUsageTrackerService ); taskCancellation.setQueryGroupStateMapAccessor((x) -> new QueryGroupState()); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; - taskCancellation.pruneDeletedQueryGroups(); + taskCancellation.pruneDeletedQueryGroups(deletedQueryGroups); assertEquals(1, deletedQueryGroups.size()); assertEquals(queryGroupId1, deletedQueryGroups.get(0).get_id()); From baa39c5c097cba621d3be2d3d1acb77c30914d28 Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Tue, 1 Oct 2024 14:23:53 -0700 Subject: [PATCH 42/47] fix bugs and SBP ITs Signed-off-by: Kaushal Kumar --- .../backpressure/SearchBackpressureIT.java | 2 ++ .../org/opensearch/wlm/QueryGroupService.java | 2 +- .../QueryGroupTaskCancellationService.java | 16 
++++++++++++++-- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/search/backpressure/SearchBackpressureIT.java b/server/src/internalClusterTest/java/org/opensearch/search/backpressure/SearchBackpressureIT.java index fb84134120e00..40c9301ef4bce 100644 --- a/server/src/internalClusterTest/java/org/opensearch/search/backpressure/SearchBackpressureIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/search/backpressure/SearchBackpressureIT.java @@ -39,6 +39,7 @@ import org.opensearch.test.ParameterizedStaticSettingsOpenSearchIntegTestCase; import org.opensearch.threadpool.ThreadPool; import org.opensearch.transport.TransportService; +import org.opensearch.wlm.QueryGroupTask; import org.hamcrest.MatcherAssert; import org.junit.After; import org.junit.Before; @@ -411,6 +412,7 @@ protected void doExecute(Task task, TestRequest request, ActionListener { try { CancellableTask cancellableTask = (CancellableTask) task; + ((QueryGroupTask) task).setQueryGroupId(threadPool.getThreadContext()); long startTime = System.nanoTime(); // Doing a busy-wait until task cancellation or timeout. 
diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java index b503c96fb4ed6..df6f6029d5f04 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java @@ -132,7 +132,7 @@ protected void doRun() { } private QueryGroupState getQueryGroupState(final String queryGroupId) { - return queryGroupStateMap.get(queryGroupId); + return queryGroupStateMap.getOrDefault(queryGroupId, queryGroupStateMap.get(QueryGroupTask.DEFAULT_QUERY_GROUP_ID_SUPPLIER.get())); } /** diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java b/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java index f0438ca12c557..4ecfa09365a40 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java @@ -23,8 +23,10 @@ import java.util.ArrayList; import java.util.Collection; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.function.BooleanSupplier; import java.util.function.Consumer; import java.util.function.Function; @@ -88,18 +90,28 @@ public void cancelTasks( // if the node is in duress, cancel tasks accordingly. 
handleNodeDuress(isNodeInDuress, activeQueryGroups, deletedQueryGroups); - updateResourceUsageInQueryGroupState(); + updateResourceUsageInQueryGroupState(activeQueryGroups); } - private void updateResourceUsageInQueryGroupState() { + private void updateResourceUsageInQueryGroupState(Collection activeQueryGroups) { + Set isSearchWorkloadRunning = new HashSet<>(); for (Map.Entry queryGroupLevelResourceUsageViewEntry : queryGroupLevelResourceUsageViews .entrySet()) { + isSearchWorkloadRunning.add(queryGroupLevelResourceUsageViewEntry.getKey()); QueryGroupState queryGroupState = getQueryGroupState(queryGroupLevelResourceUsageViewEntry.getKey()); TRACKED_RESOURCES.forEach(resourceType -> { final double currentUsage = queryGroupLevelResourceUsageViewEntry.getValue().getResourceUsageData().get(resourceType); queryGroupState.getResourceState().get(resourceType).setLastRecordedUsage(currentUsage); }); } + + activeQueryGroups.forEach(queryGroup -> { + if (!isSearchWorkloadRunning.contains(queryGroup.get_id())) { + TRACKED_RESOURCES.forEach( + resourceType -> getQueryGroupState(queryGroup.get_id()).getResourceState().get(resourceType).setLastRecordedUsage(0.0) + ); + } + }); } private void handleNodeDuress( From 3c6adae6aea7560c23d4456ff355cc6821c1c146 Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Tue, 1 Oct 2024 14:40:09 -0700 Subject: [PATCH 43/47] add missed applyCluster state change Signed-off-by: Kaushal Kumar --- .../src/main/java/org/opensearch/wlm/QueryGroupService.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java index df6f6029d5f04..ce150b011781a 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java @@ -57,7 +57,7 @@ public class QueryGroupService extends AbstractLifecycleComponent private final ThreadPool threadPool; 
private final ClusterService clusterService; private final WorkloadManagementSettings workloadManagementSettings; - private final Set activeQueryGroups; + private Set activeQueryGroups; private final Set deletedQueryGroups; private final NodeDuressTrackers nodeDuressTrackers; @@ -175,7 +175,6 @@ public void applyClusterState(ClusterChangedEvent event) { // New query group detected QueryGroup newQueryGroup = currentQueryGroups.get(queryGroupName); // Perform any necessary actions with the new query group - this.activeQueryGroups.add(newQueryGroup); queryGroupStateMap.put(newQueryGroup.get_id(), new QueryGroupState()); } } @@ -190,6 +189,7 @@ public void applyClusterState(ClusterChangedEvent event) { queryGroupStateMap.remove(deletedQueryGroup.get_id()); } } + this.activeQueryGroups = new HashSet<>(currentMetadata.queryGroups().values()); } /** From 862af7b60c1bb268385e76927beed60ef3b78ed5 Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Mon, 7 Oct 2024 10:27:21 -0700 Subject: [PATCH 44/47] address comments Signed-off-by: Kaushal Kumar --- .../main/java/org/opensearch/wlm/QueryGroupTask.java | 12 +++++------- .../org/opensearch/wlm/stats/QueryGroupStats.java | 9 ++------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java b/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java index 97a86b5aaa2bf..97c48bd828978 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupTask.java @@ -17,7 +17,6 @@ import org.opensearch.tasks.CancellableTask; import java.util.Map; -import java.util.Optional; import java.util.function.LongSupplier; import java.util.function.Supplier; @@ -82,12 +81,11 @@ public final String getQueryGroupId() { * @param threadContext current threadContext */ public final void setQueryGroupId(final ThreadContext threadContext) { - this.queryGroupId = Optional.ofNullable(threadContext) - .map( - threadContext1 
-> Optional.ofNullable(threadContext1.getHeader(QUERY_GROUP_ID_HEADER)) - .orElse(DEFAULT_QUERY_GROUP_ID_SUPPLIER.get()) - ) - .orElse(DEFAULT_QUERY_GROUP_ID_SUPPLIER.get()); + if (threadContext != null && threadContext.getHeader(QUERY_GROUP_ID_HEADER) != null) { + this.queryGroupId = threadContext.getHeader(QUERY_GROUP_ID_HEADER); + } else { + this.queryGroupId = DEFAULT_QUERY_GROUP_ID_SUPPLIER.get(); + } } public long getElapsedTime() { diff --git a/server/src/main/java/org/opensearch/wlm/stats/QueryGroupStats.java b/server/src/main/java/org/opensearch/wlm/stats/QueryGroupStats.java index 8324f7d82a375..9d74201de252b 100644 --- a/server/src/main/java/org/opensearch/wlm/stats/QueryGroupStats.java +++ b/server/src/main/java/org/opensearch/wlm/stats/QueryGroupStats.java @@ -8,7 +8,6 @@ package org.opensearch.wlm.stats; -import org.opensearch.Version; import org.opensearch.core.common.io.stream.StreamInput; import org.opensearch.core.common.io.stream.StreamOutput; import org.opensearch.core.common.io.stream.Writeable; @@ -124,9 +123,7 @@ public QueryGroupStatsHolder(StreamInput in) throws IOException { this.rejections = in.readVLong(); this.failures = in.readVLong(); this.totalCancellations = in.readVLong(); - if (in.getVersion().onOrAfter(Version.V_2_18_0)) { - this.shardCompletions = in.readVLong(); - } + this.shardCompletions = in.readVLong(); this.resourceStats = in.readMap((i) -> ResourceType.fromName(i.readString()), ResourceStats::new); } @@ -164,9 +161,7 @@ public static void writeTo(StreamOutput out, QueryGroupStatsHolder statsHolder) out.writeVLong(statsHolder.rejections); out.writeVLong(statsHolder.failures); out.writeVLong(statsHolder.totalCancellations); - if (out.getVersion().onOrAfter(Version.V_2_18_0)) { - out.writeVLong(statsHolder.shardCompletions); - } + out.writeVLong(statsHolder.shardCompletions); out.writeMap(statsHolder.resourceStats, (o, val) -> o.writeString(val.getName()), ResourceStats::writeTo); } From 
e2b203c35335f922a2e9c82c310952d0e5aa7413 Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Mon, 7 Oct 2024 15:24:40 -0700 Subject: [PATCH 45/47] decouple queryGroupService and cancellationService Signed-off-by: Kaushal Kumar --- .../main/java/org/opensearch/node/Node.java | 9 +++- .../org/opensearch/wlm/QueryGroupService.java | 39 +++++++--------- .../wlm/QueryGroupsStateAccessor.java | 44 +++++++++++++++++++ .../QueryGroupTaskCancellationService.java | 12 +++-- .../wlm/QueryGroupServiceTests.java | 32 ++++++++++---- ...adManagementTransportInterceptorTests.java | 11 ++++- ...ueryGroupTaskCancellationServiceTests.java | 32 ++++++++------ ...eryGroupRequestOperationListenerTests.java | 10 +++-- 8 files changed, 131 insertions(+), 58 deletions(-) create mode 100644 server/src/main/java/org/opensearch/wlm/QueryGroupsStateAccessor.java diff --git a/server/src/main/java/org/opensearch/node/Node.java b/server/src/main/java/org/opensearch/node/Node.java index c6e189a7720b1..ddba10369ff05 100644 --- a/server/src/main/java/org/opensearch/node/Node.java +++ b/server/src/main/java/org/opensearch/node/Node.java @@ -270,6 +270,7 @@ import org.opensearch.usage.UsageService; import org.opensearch.watcher.ResourceWatcherService; import org.opensearch.wlm.QueryGroupService; +import org.opensearch.wlm.QueryGroupsStateAccessor; import org.opensearch.wlm.WorkloadManagementSettings; import org.opensearch.wlm.WorkloadManagementTransportInterceptor; import org.opensearch.wlm.cancellation.MaximumResourceTaskSelectionStrategy; @@ -1034,15 +1035,19 @@ protected Node( settingsModule.getClusterSettings() ); + final QueryGroupsStateAccessor queryGroupsStateAccessor = new QueryGroupsStateAccessor(); + final QueryGroupService queryGroupService = new QueryGroupService( new QueryGroupTaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), - queryGroupResourceUsageTrackerService + queryGroupResourceUsageTrackerService, + queryGroupsStateAccessor ), 
clusterService, threadPool, - workloadManagementSettings + workloadManagementSettings, + queryGroupsStateAccessor ); taskResourceTrackingService.addTaskCompletionListener(queryGroupService); diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java index ce150b011781a..03df6b9f456b0 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java @@ -47,9 +47,7 @@ public class QueryGroupService extends AbstractLifecycleComponent implements ClusterStateApplier, TaskResourceTrackingService.TaskCompletionListener { - // This map does not need to be concurrent since we will process the cluster state change serially and update - // this map with new additions and deletions of entries. QueryGroupState is thread safe - private final Map queryGroupStateMap; + private static final Logger logger = LogManager.getLogger(QueryGroupService.class); private final QueryGroupTaskCancellationService taskCancellationService; @@ -60,12 +58,14 @@ public class QueryGroupService extends AbstractLifecycleComponent private Set activeQueryGroups; private final Set deletedQueryGroups; private final NodeDuressTrackers nodeDuressTrackers; + private final QueryGroupsStateAccessor queryGroupsStateAccessor; public QueryGroupService( QueryGroupTaskCancellationService taskCancellationService, ClusterService clusterService, ThreadPool threadPool, - WorkloadManagementSettings workloadManagementSettings + WorkloadManagementSettings workloadManagementSettings, + QueryGroupsStateAccessor queryGroupsStateAccessor ) { this( @@ -90,7 +90,7 @@ public QueryGroupService( ) ) ), - new HashMap<>(), + queryGroupsStateAccessor, new HashSet<>(), new HashSet<>() ); @@ -102,7 +102,7 @@ public QueryGroupService( ThreadPool threadPool, WorkloadManagementSettings workloadManagementSettings, NodeDuressTrackers nodeDuressTrackers, - Map stateMap, + 
QueryGroupsStateAccessor queryGroupsStateAccessor, Set activeQueryGroups, Set deletedQueryGroups ) { @@ -113,17 +113,16 @@ public QueryGroupService( this.nodeDuressTrackers = nodeDuressTrackers; this.activeQueryGroups = activeQueryGroups; this.deletedQueryGroups = deletedQueryGroups; - activeQueryGroups.forEach(queryGroup -> stateMap.putIfAbsent(queryGroup.get_id(), new QueryGroupState())); - this.queryGroupStateMap = stateMap; - this.queryGroupStateMap.put(QueryGroupTask.DEFAULT_QUERY_GROUP_ID_SUPPLIER.get(), new QueryGroupState()); - taskCancellationService.setQueryGroupStateMapAccessor(this::getQueryGroupState); + this.queryGroupsStateAccessor = queryGroupsStateAccessor; + activeQueryGroups.forEach(queryGroup -> this.queryGroupsStateAccessor.addNewQueryGroup(queryGroup.get_id())); + this.queryGroupsStateAccessor.addNewQueryGroup(QueryGroupTask.DEFAULT_QUERY_GROUP_ID_SUPPLIER.get()); clusterService.addStateApplier(this); } /** * run at regular interval */ - protected void doRun() { + void doRun() { if (workloadManagementSettings.getWlmMode() == WlmMode.DISABLED) { return; } @@ -131,10 +130,6 @@ protected void doRun() { taskCancellationService.pruneDeletedQueryGroups(deletedQueryGroups); } - private QueryGroupState getQueryGroupState(final String queryGroupId) { - return queryGroupStateMap.getOrDefault(queryGroupId, queryGroupStateMap.get(QueryGroupTask.DEFAULT_QUERY_GROUP_ID_SUPPLIER.get())); - } - /** * {@link AbstractLifecycleComponent} lifecycle method */ @@ -175,7 +170,7 @@ public void applyClusterState(ClusterChangedEvent event) { // New query group detected QueryGroup newQueryGroup = currentQueryGroups.get(queryGroupName); // Perform any necessary actions with the new query group - queryGroupStateMap.put(newQueryGroup.get_id(), new QueryGroupState()); + queryGroupsStateAccessor.addNewQueryGroup(newQueryGroup.get_id()); } } @@ -186,7 +181,7 @@ public void applyClusterState(ClusterChangedEvent event) { QueryGroup deletedQueryGroup = 
previousQueryGroups.get(queryGroupName); // Perform any necessary actions with the deleted query group this.deletedQueryGroups.add(deletedQueryGroup); - queryGroupStateMap.remove(deletedQueryGroup.get_id()); + queryGroupsStateAccessor.removeQueryGroup(deletedQueryGroup.get_id()); } } this.activeQueryGroups = new HashSet<>(currentMetadata.queryGroups().values()); @@ -198,7 +193,7 @@ public void applyClusterState(ClusterChangedEvent event) { * @param queryGroupId query group identifier */ public void incrementFailuresFor(final String queryGroupId) { - QueryGroupState queryGroupState = queryGroupStateMap.get(queryGroupId); + QueryGroupState queryGroupState = queryGroupsStateAccessor.getQueryGroupState(queryGroupId); // This can happen if the request failed for a deleted query group // or new queryGroup is being created and has not been acknowledged yet if (queryGroupState == null) { @@ -212,7 +207,7 @@ public void incrementFailuresFor(final String queryGroupId) { */ public QueryGroupStats nodeStats() { final Map statsHolderMap = new HashMap<>(); - for (Map.Entry queryGroupsState : queryGroupStateMap.entrySet()) { + for (Map.Entry queryGroupsState : queryGroupsStateAccessor.getQueryGroupStateMap().entrySet()) { final String queryGroupId = queryGroupsState.getKey(); final QueryGroupState currentState = queryGroupsState.getValue(); @@ -231,7 +226,7 @@ public void rejectIfNeeded(String queryGroupId) { } if (queryGroupId == null || queryGroupId.equals(QueryGroupTask.DEFAULT_QUERY_GROUP_ID_SUPPLIER.get())) return; - QueryGroupState queryGroupState = queryGroupStateMap.get(queryGroupId); + QueryGroupState queryGroupState = queryGroupsStateAccessor.getQueryGroupState(queryGroupId); // This can happen if the request failed for a deleted query group // or new queryGroup is being created and has not been acknowledged yet or invalid query group id @@ -327,9 +322,9 @@ public void onTaskCompleted(Task task) { } if (task instanceof SearchShardTask) { - 
queryGroupStateMap.get(queryGroupId).shardCompletions.inc(); + queryGroupsStateAccessor.getQueryGroupState(queryGroupId).shardCompletions.inc(); } else { - queryGroupStateMap.get(queryGroupId).completions.inc(); + queryGroupsStateAccessor.getQueryGroupState(queryGroupId).completions.inc(); } } } diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupsStateAccessor.java b/server/src/main/java/org/opensearch/wlm/QueryGroupsStateAccessor.java new file mode 100644 index 0000000000000..f51d5b78a9ea8 --- /dev/null +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupsStateAccessor.java @@ -0,0 +1,44 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.wlm; + +import org.opensearch.wlm.stats.QueryGroupState; + +import java.util.HashMap; +import java.util.Map; + +public class QueryGroupsStateAccessor { + // This map does not need to be concurrent since we will process the cluster state change serially and update + // this map with new additions and deletions of entries. 
QueryGroupState is thread safe + private final Map queryGroupStateMap; + + public QueryGroupsStateAccessor() { + this(new HashMap<>()); + } + + public QueryGroupsStateAccessor(Map queryGroupStateMap) { + this.queryGroupStateMap = queryGroupStateMap; + } + + public Map getQueryGroupStateMap() { + return queryGroupStateMap; + } + + public QueryGroupState getQueryGroupState(String queryGroupId) { + return queryGroupStateMap.getOrDefault(queryGroupId, queryGroupStateMap.get(QueryGroupTask.DEFAULT_QUERY_GROUP_ID_SUPPLIER.get())); + } + + public void addNewQueryGroup(String queryGroupId) { + this.queryGroupStateMap.putIfAbsent(queryGroupId, new QueryGroupState()); + } + + public void removeQueryGroup(String queryGroupId) { + this.queryGroupStateMap.remove(queryGroupId); + } +} diff --git a/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java b/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java index 4ecfa09365a40..e82a19c5f7af2 100644 --- a/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java +++ b/server/src/main/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationService.java @@ -15,6 +15,7 @@ import org.opensearch.wlm.MutableQueryGroupFragment.ResiliencyMode; import org.opensearch.wlm.QueryGroupLevelResourceUsageView; import org.opensearch.wlm.QueryGroupTask; +import org.opensearch.wlm.QueryGroupsStateAccessor; import org.opensearch.wlm.ResourceType; import org.opensearch.wlm.WlmMode; import org.opensearch.wlm.WorkloadManagementSettings; @@ -29,7 +30,6 @@ import java.util.Set; import java.util.function.BooleanSupplier; import java.util.function.Consumer; -import java.util.function.Function; import java.util.stream.Collectors; import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.TRACKED_RESOURCES; @@ -60,19 +60,17 @@ public class QueryGroupTaskCancellationService { private final QueryGroupResourceUsageTrackerService 
resourceUsageTrackerService; // a map of QueryGroupId to its corresponding QueryGroupLevelResourceUsageView object Map queryGroupLevelResourceUsageViews; - private Function queryGroupStateAccessor; + private final QueryGroupsStateAccessor queryGroupStateAccessor; public QueryGroupTaskCancellationService( WorkloadManagementSettings workloadManagementSettings, TaskSelectionStrategy taskSelectionStrategy, - QueryGroupResourceUsageTrackerService resourceUsageTrackerService + QueryGroupResourceUsageTrackerService resourceUsageTrackerService, + QueryGroupsStateAccessor queryGroupStateAccessor ) { this.workloadManagementSettings = workloadManagementSettings; this.taskSelectionStrategy = taskSelectionStrategy; this.resourceUsageTrackerService = resourceUsageTrackerService; - } - - public void setQueryGroupStateMapAccessor(final Function queryGroupStateAccessor) { this.queryGroupStateAccessor = queryGroupStateAccessor; } @@ -259,7 +257,7 @@ private Runnable getResourceTypeOnCancelCallback(String queryGroupId, ResourceTy private QueryGroupState getQueryGroupState(String queryGroupId) { assert queryGroupId != null : "queryGroupId should never be null at this point."; - return queryGroupStateAccessor.apply(queryGroupId); + return queryGroupStateAccessor.getQueryGroupState(queryGroupId); } /** diff --git a/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java b/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java index 0e4303a1983a2..9c59cf62b85c1 100644 --- a/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java +++ b/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java @@ -56,6 +56,7 @@ public class QueryGroupServiceTests extends OpenSearchTestCase { private Scheduler.Cancellable mockScheduledFuture; private Map mockQueryGroupStateMap; NodeDuressTrackers mockNodeDuressTrackers; + QueryGroupsStateAccessor mockQueryGroupsStateAccessor; public void setUp() throws Exception { super.setUp(); @@ -66,6 +67,7 @@ public void 
setUp() throws Exception { mockQueryGroupStateMap = new HashMap<>(); mockNodeDuressTrackers = Mockito.mock(NodeDuressTrackers.class); mockCancellationService = Mockito.mock(TestQueryGroupCancellationService.class); + mockQueryGroupsStateAccessor = new QueryGroupsStateAccessor(); queryGroupService = new QueryGroupService( mockCancellationService, @@ -73,7 +75,7 @@ public void setUp() throws Exception { mockThreadPool, mockWorkloadManagementSettings, mockNodeDuressTrackers, - mockQueryGroupStateMap, + mockQueryGroupsStateAccessor, new HashSet<>(), new HashSet<>() ); @@ -178,6 +180,7 @@ public void testRejectIfNeeded_whenQueryGroupIdIsNullOrDefaultOne() { } }; mockQueryGroupStateMap = new HashMap<>(); + mockQueryGroupsStateAccessor = new QueryGroupsStateAccessor(mockQueryGroupStateMap); mockQueryGroupStateMap.put("queryGroupId1", new QueryGroupState()); Map spyMap = spy(mockQueryGroupStateMap); @@ -188,7 +191,7 @@ public void testRejectIfNeeded_whenQueryGroupIdIsNullOrDefaultOne() { mockThreadPool, mockWorkloadManagementSettings, mockNodeDuressTrackers, - spyMap, + mockQueryGroupsStateAccessor, activeQueryGroups, new HashSet<>() ); @@ -216,6 +219,8 @@ public void testRejectIfNeeded_whenQueryGroupIsSoftMode() { QueryGroupState spyState = spy(new QueryGroupState()); mockQueryGroupStateMap.put("queryGroupId1", spyState); + mockQueryGroupsStateAccessor = new QueryGroupsStateAccessor(mockQueryGroupStateMap); + Map spyMap = spy(mockQueryGroupStateMap); queryGroupService = new QueryGroupService( @@ -224,7 +229,7 @@ public void testRejectIfNeeded_whenQueryGroupIsSoftMode() { mockThreadPool, mockWorkloadManagementSettings, mockNodeDuressTrackers, - spyMap, + mockQueryGroupsStateAccessor, activeQueryGroups, new HashSet<>() ); @@ -252,13 +257,15 @@ public void testRejectIfNeeded_whenQueryGroupIsEnforcedMode_andNotBreaching() { mockQueryGroupStateMap.put("queryGroupId1", queryGroupState); + mockQueryGroupsStateAccessor = new QueryGroupsStateAccessor(mockQueryGroupStateMap); + 
queryGroupService = new QueryGroupService( mockCancellationService, mockClusterService, mockThreadPool, mockWorkloadManagementSettings, mockNodeDuressTrackers, - mockQueryGroupStateMap, + mockQueryGroupsStateAccessor, activeQueryGroups, new HashSet<>() ); @@ -295,6 +302,8 @@ public void testRejectIfNeeded_whenQueryGroupIsEnforcedMode_andBreaching() { queryGroupState.getResourceState().get(ResourceType.MEMORY).setLastRecordedUsage(0.18); QueryGroupState spyState = spy(queryGroupState); + mockQueryGroupsStateAccessor = new QueryGroupsStateAccessor(mockQueryGroupStateMap); + mockQueryGroupStateMap.put("queryGroupId1", spyState); queryGroupService = new QueryGroupService( @@ -303,7 +312,7 @@ public void testRejectIfNeeded_whenQueryGroupIsEnforcedMode_andBreaching() { mockThreadPool, mockWorkloadManagementSettings, mockNodeDuressTrackers, - mockQueryGroupStateMap, + mockQueryGroupsStateAccessor, activeQueryGroups, new HashSet<>() ); @@ -339,13 +348,15 @@ public void testRejectIfNeeded_whenFeatureIsNotEnabled() { Map spyMap = spy(mockQueryGroupStateMap); + mockQueryGroupsStateAccessor = new QueryGroupsStateAccessor(mockQueryGroupStateMap); + queryGroupService = new QueryGroupService( mockCancellationService, mockClusterService, mockThreadPool, mockWorkloadManagementSettings, mockNodeDuressTrackers, - spyMap, + mockQueryGroupsStateAccessor, activeQueryGroups, new HashSet<>() ); @@ -361,13 +372,14 @@ public void testOnTaskCompleted() { mockThreadPool.getThreadContext().putHeader(QueryGroupTask.QUERY_GROUP_ID_HEADER, "testId"); QueryGroupState queryGroupState = new QueryGroupState(); mockQueryGroupStateMap.put("testId", queryGroupState); + mockQueryGroupsStateAccessor = new QueryGroupsStateAccessor(mockQueryGroupStateMap); queryGroupService = new QueryGroupService( mockCancellationService, mockClusterService, mockThreadPool, mockWorkloadManagementSettings, mockNodeDuressTrackers, - mockQueryGroupStateMap, + mockQueryGroupsStateAccessor, new HashSet<>() { { add( @@ -406,13 
+418,14 @@ public void testShouldSBPHandle() { QueryGroupState queryGroupState = new QueryGroupState(); Set activeQueryGroups = new HashSet<>(); mockQueryGroupStateMap.put("testId", queryGroupState); + mockQueryGroupsStateAccessor = new QueryGroupsStateAccessor(mockQueryGroupStateMap); queryGroupService = new QueryGroupService( mockCancellationService, mockClusterService, mockThreadPool, mockWorkloadManagementSettings, mockNodeDuressTrackers, - mockQueryGroupStateMap, + mockQueryGroupsStateAccessor, activeQueryGroups, Collections.emptySet() ); @@ -457,10 +470,11 @@ public TestQueryGroupCancellationService( WorkloadManagementSettings workloadManagementSettings, TaskSelectionStrategy taskSelectionStrategy, QueryGroupResourceUsageTrackerService resourceUsageTrackerService, + QueryGroupsStateAccessor queryGroupsStateAccessor, Collection activeQueryGroups, Collection deletedQueryGroups ) { - super(workloadManagementSettings, taskSelectionStrategy, resourceUsageTrackerService); + super(workloadManagementSettings, taskSelectionStrategy, resourceUsageTrackerService, queryGroupsStateAccessor); } @Override diff --git a/server/src/test/java/org/opensearch/wlm/WorkloadManagementTransportInterceptorTests.java b/server/src/test/java/org/opensearch/wlm/WorkloadManagementTransportInterceptorTests.java index 4d539fa708c42..d4cd7b79455a3 100644 --- a/server/src/test/java/org/opensearch/wlm/WorkloadManagementTransportInterceptorTests.java +++ b/server/src/test/java/org/opensearch/wlm/WorkloadManagementTransportInterceptorTests.java @@ -32,6 +32,7 @@ public class WorkloadManagementTransportInterceptorTests extends OpenSearchTestC private WorkloadManagementSettings mockWorkloadManagementSettings; private ThreadPool threadPool; private WorkloadManagementTransportInterceptor sut; + private QueryGroupsStateAccessor stateAccessor; public void setUp() throws Exception { super.setUp(); @@ -40,6 +41,8 @@ public void setUp() throws Exception { mockThreadPool = mock(ThreadPool.class); 
mockWorkloadManagementSettings = mock(WorkloadManagementSettings.class); threadPool = new TestThreadPool(getTestName()); + stateAccessor = new QueryGroupsStateAccessor(); + ClusterState state = mock(ClusterState.class); Metadata metadata = mock(Metadata.class); when(mockClusterService.state()).thenReturn(state); @@ -47,7 +50,13 @@ public void setUp() throws Exception { when(metadata.queryGroups()).thenReturn(Collections.emptyMap()); sut = new WorkloadManagementTransportInterceptor( threadPool, - new QueryGroupService(mockTaskCancellationService, mockClusterService, mockThreadPool, mockWorkloadManagementSettings) + new QueryGroupService( + mockTaskCancellationService, + mockClusterService, + mockThreadPool, + mockWorkloadManagementSettings, + stateAccessor + ) ); } diff --git a/server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java b/server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java index c3adda66b9af5..13e8e2c527073 100644 --- a/server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java +++ b/server/src/test/java/org/opensearch/wlm/cancellation/QueryGroupTaskCancellationServiceTests.java @@ -17,6 +17,7 @@ import org.opensearch.wlm.MutableQueryGroupFragment.ResiliencyMode; import org.opensearch.wlm.QueryGroupLevelResourceUsageView; import org.opensearch.wlm.QueryGroupTask; +import org.opensearch.wlm.QueryGroupsStateAccessor; import org.opensearch.wlm.ResourceType; import org.opensearch.wlm.WlmMode; import org.opensearch.wlm.WorkloadManagementSettings; @@ -36,6 +37,7 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; +import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -51,6 +53,7 @@ public class QueryGroupTaskCancellationServiceTests extends OpenSearchTestCase { private QueryGroupTaskCancellationService taskCancellation; private 
WorkloadManagementSettings workloadManagementSettings; private QueryGroupResourceUsageTrackerService resourceUsageTrackerService; + private QueryGroupsStateAccessor stateAccessor; @Before public void setup() { @@ -63,12 +66,14 @@ public void setup() { when(workloadManagementSettings.getNodeLevelCpuCancellationThreshold()).thenReturn(0.9); when(workloadManagementSettings.getNodeLevelMemoryCancellationThreshold()).thenReturn(0.9); resourceUsageTrackerService = mock(QueryGroupResourceUsageTrackerService.class); + stateAccessor = mock(QueryGroupsStateAccessor.class); + when(stateAccessor.getQueryGroupState(any())).thenReturn(new QueryGroupState()); taskCancellation = new QueryGroupTaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), - resourceUsageTrackerService + resourceUsageTrackerService, + stateAccessor ); - taskCancellation.setQueryGroupStateMapAccessor((x) -> new QueryGroupState()); } public void testGetCancellableTasksFrom_setupAppropriateCancellationReasonAndScore() { @@ -189,7 +194,8 @@ public void testGetCancellableTasksFrom_filtersQueryGroupCorrectly() { QueryGroupTaskCancellationService taskCancellation = new QueryGroupTaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), - resourceUsageTrackerService + resourceUsageTrackerService, + stateAccessor ); List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.SOFT, activeQueryGroups); @@ -219,9 +225,9 @@ public void testCancelTasks_cancelsGivenTasks() { QueryGroupTaskCancellationService taskCancellation = new QueryGroupTaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), - resourceUsageTrackerService + resourceUsageTrackerService, + stateAccessor ); - taskCancellation.setQueryGroupStateMapAccessor((x) -> new QueryGroupState()); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; @@ -281,9 +287,9 @@ public void 
testCancelTasks_cancelsTasksFromDeletedQueryGroups() { QueryGroupTaskCancellationService taskCancellation = new QueryGroupTaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), - resourceUsageTrackerService + resourceUsageTrackerService, + stateAccessor ); - taskCancellation.setQueryGroupStateMapAccessor((x) -> new QueryGroupState()); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED, activeQueryGroups); @@ -351,9 +357,9 @@ public void testCancelTasks_does_not_cancelTasksFromDeletedQueryGroups_whenNodeN QueryGroupTaskCancellationService taskCancellation = new QueryGroupTaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), - resourceUsageTrackerService + resourceUsageTrackerService, + stateAccessor ); - taskCancellation.setQueryGroupStateMapAccessor((x) -> new QueryGroupState()); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; List cancellableTasksFrom = taskCancellation.getAllCancellableTasks(ResiliencyMode.ENFORCED, activeQueryGroups); @@ -410,9 +416,9 @@ public void testCancelTasks_cancelsGivenTasks_WhenNodeInDuress() { QueryGroupTaskCancellationService taskCancellation = new QueryGroupTaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), - resourceUsageTrackerService + resourceUsageTrackerService, + stateAccessor ); - taskCancellation.setQueryGroupStateMapAccessor((x) -> new QueryGroupState()); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; @@ -543,9 +549,9 @@ public void testPruneDeletedQueryGroups() { QueryGroupTaskCancellationService taskCancellation = new QueryGroupTaskCancellationService( workloadManagementSettings, new MaximumResourceTaskSelectionStrategy(), - resourceUsageTrackerService + resourceUsageTrackerService, + stateAccessor ); - 
taskCancellation.setQueryGroupStateMapAccessor((x) -> new QueryGroupState()); taskCancellation.queryGroupLevelResourceUsageViews = queryGroupLevelViews; taskCancellation.pruneDeletedQueryGroups(deletedQueryGroups); diff --git a/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java b/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java index 9dba17ff50a19..1127b50399d24 100644 --- a/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java +++ b/server/src/test/java/org/opensearch/wlm/listeners/QueryGroupRequestOperationListenerTests.java @@ -18,6 +18,7 @@ import org.opensearch.threadpool.ThreadPool; import org.opensearch.wlm.QueryGroupService; import org.opensearch.wlm.QueryGroupTask; +import org.opensearch.wlm.QueryGroupsStateAccessor; import org.opensearch.wlm.ResourceType; import org.opensearch.wlm.WorkloadManagementSettings; import org.opensearch.wlm.cancellation.QueryGroupTaskCancellationService; @@ -120,6 +121,7 @@ public void testValidQueryGroupRequestFailure() throws IOException { public void testMultiThreadedValidQueryGroupRequestFailures() { queryGroupStateMap.put(testQueryGroupId, new QueryGroupState()); + QueryGroupsStateAccessor accessor = new QueryGroupsStateAccessor(queryGroupStateMap); setupMockedQueryGroupsFromClusterState(); queryGroupService = new QueryGroupService( taskCancellationService, @@ -127,7 +129,7 @@ public void testMultiThreadedValidQueryGroupRequestFailures() { testThreadPool, mockWorkloadManagementSettings, null, - queryGroupStateMap, + accessor, Collections.emptySet(), Collections.emptySet() ); @@ -212,7 +214,7 @@ public void testInvalidQueryGroupFailure() throws IOException { new QueryGroupStats.QueryGroupStatsHolder( 0, 0, - 0, + 1, 0, 0, Map.of( @@ -235,7 +237,7 @@ private void assertSuccess( QueryGroupStats expectedStats, String threadContextQG_Id ) { - + QueryGroupsStateAccessor stateAccessor = new 
QueryGroupsStateAccessor(queryGroupStateMap); try (ThreadContext.StoredContext currentContext = testThreadPool.getThreadContext().stashContext()) { testThreadPool.getThreadContext().putHeader(QueryGroupTask.QUERY_GROUP_ID_HEADER, threadContextQG_Id); queryGroupStateMap.put(testQueryGroupId, new QueryGroupState()); @@ -248,7 +250,7 @@ private void assertSuccess( testThreadPool, mockWorkloadManagementSettings, null, - queryGroupStateMap, + stateAccessor, Collections.emptySet(), Collections.emptySet() ); From 623078a6a5db26b9e85a6972f4ae21e93b4b905f Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Mon, 7 Oct 2024 15:33:50 -0700 Subject: [PATCH 46/47] replace StateApplier with StateListener interface Signed-off-by: Kaushal Kumar --- .../main/java/org/opensearch/wlm/QueryGroupService.java | 8 ++++---- .../java/org/opensearch/wlm/QueryGroupServiceTests.java | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java index 03df6b9f456b0..cda5916db26f3 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupService.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupService.java @@ -12,7 +12,7 @@ import org.apache.logging.log4j.Logger; import org.opensearch.action.search.SearchShardTask; import org.opensearch.cluster.ClusterChangedEvent; -import org.opensearch.cluster.ClusterStateApplier; +import org.opensearch.cluster.ClusterStateListener; import org.opensearch.cluster.metadata.Metadata; import org.opensearch.cluster.metadata.QueryGroup; import org.opensearch.cluster.service.ClusterService; @@ -45,7 +45,7 @@ */ public class QueryGroupService extends AbstractLifecycleComponent implements - ClusterStateApplier, + ClusterStateListener, TaskResourceTrackingService.TaskCompletionListener { private static final Logger logger = LogManager.getLogger(QueryGroupService.class); @@ -116,7 +116,7 @@ public QueryGroupService( 
this.queryGroupsStateAccessor = queryGroupsStateAccessor; activeQueryGroups.forEach(queryGroup -> this.queryGroupsStateAccessor.addNewQueryGroup(queryGroup.get_id())); this.queryGroupsStateAccessor.addNewQueryGroup(QueryGroupTask.DEFAULT_QUERY_GROUP_ID_SUPPLIER.get()); - clusterService.addStateApplier(this); + this.clusterService.addListener(this); } /** @@ -155,7 +155,7 @@ protected void doStop() { protected void doClose() throws IOException {} @Override - public void applyClusterState(ClusterChangedEvent event) { + public void clusterChanged(ClusterChangedEvent event) { // Retrieve the current and previous cluster states Metadata previousMetadata = event.previousState().metadata(); Metadata currentMetadata = event.state().metadata(); diff --git a/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java b/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java index 9c59cf62b85c1..c5cf0dac4f807 100644 --- a/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java +++ b/server/src/test/java/org/opensearch/wlm/QueryGroupServiceTests.java @@ -86,7 +86,7 @@ public void tearDown() throws Exception { mockThreadPool.shutdown(); } - public void testApplyClusterState() { + public void testClusterChanged() { ClusterChangedEvent mockClusterChangedEvent = Mockito.mock(ClusterChangedEvent.class); ClusterState mockPreviousClusterState = Mockito.mock(ClusterState.class); ClusterState mockClusterState = Mockito.mock(ClusterState.class); @@ -115,7 +115,7 @@ public void testApplyClusterState() { when(mockClusterState.metadata()).thenReturn(mockMetadata); when(mockPreviousMetadata.queryGroups()).thenReturn(previousQueryGroups); when(mockMetadata.queryGroups()).thenReturn(currentQueryGroups); - queryGroupService.applyClusterState(mockClusterChangedEvent); + queryGroupService.clusterChanged(mockClusterChangedEvent); Set currentQueryGroupsExpected = Set.of(currentQueryGroups.get("4241")); Set previousQueryGroupsExpected = 
Set.of(previousQueryGroups.get("4242")); From 2d7316bb2fc92eada5cbeaca487cc8982a0f2778 Mon Sep 17 00:00:00 2001 From: Kaushal Kumar Date: Mon, 7 Oct 2024 16:05:55 -0700 Subject: [PATCH 47/47] fix precommit errors Signed-off-by: Kaushal Kumar --- .../wlm/QueryGroupsStateAccessor.java | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/server/src/main/java/org/opensearch/wlm/QueryGroupsStateAccessor.java b/server/src/main/java/org/opensearch/wlm/QueryGroupsStateAccessor.java index f51d5b78a9ea8..7f93e41f12092 100644 --- a/server/src/main/java/org/opensearch/wlm/QueryGroupsStateAccessor.java +++ b/server/src/main/java/org/opensearch/wlm/QueryGroupsStateAccessor.java @@ -13,6 +13,10 @@ import java.util.HashMap; import java.util.Map; +/** + * This class is used to decouple {@link QueryGroupService} and {@link org.opensearch.wlm.cancellation.QueryGroupTaskCancellationService} to share the + * {@link QueryGroupState}s + */ public class QueryGroupsStateAccessor { // This map does not need to be concurrent since we will process the cluster state change serially and update // this map with new additions and deletions of entries. 
QueryGroupState is thread safe @@ -26,18 +30,34 @@ public QueryGroupsStateAccessor(Map queryGroupStateMap) this.queryGroupStateMap = queryGroupStateMap; } + /** + * Returns the backing map of queryGroupId to QueryGroupState. + */ public Map getQueryGroupStateMap() { return queryGroupStateMap; } + /** + * Returns the QueryGroupState for the given queryGroupId. + * @param queryGroupId id of the query group to look up + * @return QueryGroupState for the given queryGroupId; if the id is not present in the map, the default query group's state + */ public QueryGroupState getQueryGroupState(String queryGroupId) { return queryGroupStateMap.getOrDefault(queryGroupId, queryGroupStateMap.get(QueryGroupTask.DEFAULT_QUERY_GROUP_ID_SUPPLIER.get())); } + /** + * Adds a new QueryGroupState for the given queryGroupId if one is not already present. + * @param queryGroupId id of the query group to add + */ public void addNewQueryGroup(String queryGroupId) { this.queryGroupStateMap.putIfAbsent(queryGroupId, new QueryGroupState()); } + /** + * Removes the QueryGroupState stored for the given queryGroupId. + * @param queryGroupId id of the query group to remove + */ public void removeQueryGroup(String queryGroupId) { this.queryGroupStateMap.remove(queryGroupId); }