Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cancellation framework changes in wlm #15651

Merged
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
a720136
cancellation related
kiranprakash154 Aug 7, 2024
83e20c0
Update CHANGELOG.md
kiranprakash154 Aug 7, 2024
9983c73
add better cancellation reason
kiranprakash154 Aug 7, 2024
245ee5d
Update DefaultTaskCancellationTests.java
kiranprakash154 Aug 7, 2024
0771fd2
refactor
kiranprakash154 Aug 21, 2024
4b1ef81
refactor
kiranprakash154 Aug 26, 2024
3ea44d7
Update DefaultTaskCancellation.java
kiranprakash154 Aug 27, 2024
0103089
Update DefaultTaskCancellation.java
kiranprakash154 Aug 29, 2024
092d715
Update DefaultTaskCancellation.java
kiranprakash154 Aug 29, 2024
4a2c51e
Update DefaultTaskSelectionStrategy.java
kiranprakash154 Aug 29, 2024
cbb51bd
refactor
kiranprakash154 Aug 29, 2024
4e846e2
refactor node level threshold
kiranprakash154 Aug 29, 2024
241b036
Merge branch 'main' into kp/wlm-cancellation-1
kaushalmahi12 Aug 30, 2024
7511d99
use query group task
kaushalmahi12 Aug 30, 2024
498743a
code clean up and refactorings
kaushalmahi12 Sep 3, 2024
e26e525
add unit tests and fix existing ones
kaushalmahi12 Sep 4, 2024
0ff2b09
uncomment the test case
kaushalmahi12 Sep 4, 2024
ddb8dce
update CHANGELOG
kaushalmahi12 Sep 4, 2024
3528054
Merge branch 'main' into feature/wlm-cancellation
kaushalmahi12 Sep 4, 2024
e8366a5
fix imports
kaushalmahi12 Sep 4, 2024
448ea41
refactor and add UTs for new constructs
kaushalmahi12 Sep 5, 2024
3fc21be
fix javadocs
kaushalmahi12 Sep 5, 2024
fe02a6a
remove code clutter
kaushalmahi12 Sep 6, 2024
8aede33
change annotation version and task selection strategy
kaushalmahi12 Sep 6, 2024
623f6f8
rename a util class
kaushalmahi12 Sep 6, 2024
9e2e3ea
remove wrappers from resource type
kaushalmahi12 Sep 6, 2024
34184ef
apply spotless
kaushalmahi12 Sep 6, 2024
91893e7
address comments
kaushalmahi12 Sep 9, 2024
66e43b2
add rename changes
kaushalmahi12 Sep 9, 2024
a6b1afd
Merge branch 'main' into feature/wlm-cancellation
kaushalmahi12 Sep 9, 2024
981b15f
address comments
kaushalmahi12 Sep 9, 2024
caf5914
refactor changes and logical bug fix
kaushalmahi12 Sep 10, 2024
b78ca02
address comments
kaushalmahi12 Sep 11, 2024
7bb6b2c
Merge branch 'main' into feature/wlm-cancellation
jainankitk Sep 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- [Workload Management] Add Get QueryGroup API Logic ([#14709](https://github.com/opensearch-project/OpenSearch/pull/14709))
- [Workload Management] Add Settings for Workload Management feature ([#15028](https://github.com/opensearch-project/OpenSearch/pull/15028))
- [Workload Management] Add Update QueryGroup API Logic ([#14775](https://github.com/opensearch-project/OpenSearch/pull/14775))
- [Workload Management] QueryGroup resource cancellation framework changes ([#15651](https://github.com/opensearch-project/OpenSearch/pull/15651))
- [Workload Management] QueryGroup resource tracking framework changes ([#13897](https://github.com/opensearch-project/OpenSearch/pull/13897))
- Support filtering on a large list encoded by bitmap ([#14774](https://github.com/opensearch-project/OpenSearch/pull/14774))
- Add slice execution listeners to SearchOperationListener interface ([#15153](https://github.com/opensearch-project/OpenSearch/pull/15153))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

package org.opensearch.wlm;

import org.opensearch.tasks.Task;
import org.opensearch.wlm.tracker.QueryGroupResourceUsage;

import java.util.List;
import java.util.Map;
Expand All @@ -20,11 +20,11 @@
*/
public class QueryGroupLevelResourceUsageView {
// resourceUsage holds the resource usage data for a QueryGroup at a point in time
private final Map<ResourceType, Long> resourceUsage;
private final Map<ResourceType, QueryGroupResourceUsage> resourceUsage;
// activeTasks holds the list of active tasks for a QueryGroup at a point in time
private final List<Task> activeTasks;
private final List<QueryGroupTask> activeTasks;

public QueryGroupLevelResourceUsageView(Map<ResourceType, Long> resourceUsage, List<Task> activeTasks) {
public QueryGroupLevelResourceUsageView(Map<ResourceType, QueryGroupResourceUsage> resourceUsage, List<QueryGroupTask> activeTasks) {
this.resourceUsage = resourceUsage;
this.activeTasks = activeTasks;
}
Expand All @@ -34,7 +34,7 @@ public QueryGroupLevelResourceUsageView(Map<ResourceType, Long> resourceUsage, L
*
* @return The map of resource usage data
*/
public Map<ResourceType, Long> getResourceUsageData() {
public Map<ResourceType, QueryGroupResourceUsage> getResourceUsageData() {
return resourceUsage;
}

Expand All @@ -43,7 +43,7 @@ public Map<ResourceType, Long> getResourceUsageData() {
*
* @return The list of active tasks
*/
public List<Task> getActiveTasks() {
public List<QueryGroupTask> getActiveTasks() {
return activeTasks;
}
}
21 changes: 3 additions & 18 deletions server/src/main/java/org/opensearch/wlm/ResourceType.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,9 @@

import org.opensearch.common.annotation.PublicApi;
import org.opensearch.core.common.io.stream.StreamOutput;
import org.opensearch.core.tasks.resourcetracker.ResourceStats;
import org.opensearch.tasks.Task;

import java.io.IOException;
import java.util.List;
import java.util.function.Function;

/**
* Enum to hold the resource type
Expand All @@ -24,18 +21,16 @@
*/
@PublicApi(since = "2.17.0")
public enum ResourceType {
CPU("cpu", task -> task.getTotalResourceUtilization(ResourceStats.CPU), true),
MEMORY("memory", task -> task.getTotalResourceUtilization(ResourceStats.MEMORY), true);
CPU("cpu", true),
MEMORY("memory", true);

private final String name;
private final Function<Task, Long> getResourceUsage;
private final boolean statsEnabled;

private static List<ResourceType> sortedValues = List.of(CPU, MEMORY);

ResourceType(String name, Function<Task, Long> getResourceUsage, boolean statsEnabled) {
ResourceType(String name, boolean statsEnabled) {
this.name = name;
this.getResourceUsage = getResourceUsage;
this.statsEnabled = statsEnabled;
}

Expand All @@ -61,16 +56,6 @@ public String getName() {
return name;
}

/**
* Gets the resource usage for a given resource type and task.
*
* @param task the task for which to calculate resource usage
* @return the resource usage
*/
public long getResourceUsage(Task task) {
return getResourceUsage.apply(task);
}

public boolean hasStatsEnabled() {
return statsEnabled;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.wlm.cancellation;

import org.opensearch.cluster.metadata.QueryGroup;
import org.opensearch.tasks.CancellableTask;
import org.opensearch.tasks.TaskCancellation;
import org.opensearch.wlm.MutableQueryGroupFragment.ResiliencyMode;
import org.opensearch.wlm.QueryGroupLevelResourceUsageView;
import org.opensearch.wlm.QueryGroupTask;
import org.opensearch.wlm.ResourceType;
import org.opensearch.wlm.WorkloadManagementSettings;
import org.opensearch.wlm.tracker.QueryGroupResourceUsage;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.function.BooleanSupplier;
import java.util.function.Consumer;
import java.util.stream.Collectors;

import static org.opensearch.wlm.tracker.QueryGroupResourceUsageTrackerService.TRACKED_RESOURCES;

/**
* Manages the cancellation of tasks enforced by QueryGroup thresholds on resource usage criteria.
* This class utilizes a strategy pattern through {@link DefaultTaskSelectionStrategy} to identify tasks that exceed
* predefined resource usage limits and are therefore eligible for cancellation.
*
* <p>The cancellation process is initiated by evaluating the resource usage of each QueryGroup against its
* resource limits. Tasks that contribute to exceeding these limits are selected for cancellation based on the
* implemented task selection strategy.</p>
*
* <p>Instances of this class are configured with a map linking QueryGroup IDs to their corresponding resource usage
* views, a set of active QueryGroups, and a task selection strategy. These components collectively facilitate the
* identification and cancellation of tasks that threaten to breach QueryGroup resource limits.</p>
*
* @see DefaultTaskSelectionStrategy
* @see QueryGroup
* @see ResourceType
*/
public class DefaultTaskCancellation {
kaushalmahi12 marked this conversation as resolved.
Show resolved Hide resolved
// NOTE(review): not referenced inside this class; presumably a tolerance for near-zero double comparisons
// used by collaborators — confirm against callers.
public static final double MIN_VALUE = 1e-9;

// settings passed to the resource usage objects when checking thresholds and computing reduce-by amounts
protected final WorkloadManagementSettings workloadManagementSettings;
// strategy used to pick which active tasks of a QueryGroup get cancelled
protected final DefaultTaskSelectionStrategy defaultTaskSelectionStrategy;
kaushalmahi12 marked this conversation as resolved.
Show resolved Hide resolved
// a map of QueryGroupId to its corresponding QueryGroupLevelResourceUsageView object
protected final Map<String, QueryGroupLevelResourceUsageView> queryGroupLevelResourceUsageViews;
// QueryGroups that are filtered by resiliency mode and checked for threshold breaches
protected final Collection<QueryGroup> activeQueryGroups;
// QueryGroups whose tasks are considered first for cancellation when the node is in duress
protected final Collection<QueryGroup> deletedQueryGroups;
// polled before each duress action to decide whether further cancellation is still required
protected BooleanSupplier isNodeInDuress;

/**
 * Creates a task cancellation handler from the given collaborators and QueryGroup state.
 *
 * @param workloadManagementSettings        settings forwarded to threshold and reduce-by evaluations
 * @param defaultTaskSelectionStrategy      strategy that selects the tasks to cancel within a QueryGroup
 * @param queryGroupLevelResourceUsageViews map of QueryGroup id to its resource usage view
 * @param activeQueryGroups                 QueryGroups evaluated against their resource limits
 * @param deletedQueryGroups                QueryGroups handled first when the node is in duress
 * @param isNodeInDuress                    supplier reporting whether the node is currently in duress
 */
public DefaultTaskCancellation(
    WorkloadManagementSettings workloadManagementSettings,
    DefaultTaskSelectionStrategy defaultTaskSelectionStrategy,
    Map<String, QueryGroupLevelResourceUsageView> queryGroupLevelResourceUsageViews,
    Collection<QueryGroup> activeQueryGroups,
    Collection<QueryGroup> deletedQueryGroups,
    BooleanSupplier isNodeInDuress
) {
    // QueryGroup state
    this.queryGroupLevelResourceUsageViews = queryGroupLevelResourceUsageViews;
    this.activeQueryGroups = activeQueryGroups;
    this.deletedQueryGroups = deletedQueryGroups;
    // collaborators
    this.workloadManagementSettings = workloadManagementSettings;
    this.defaultTaskSelectionStrategy = defaultTaskSelectionStrategy;
    this.isNodeInDuress = isNodeInDuress;
}

/**
 * Entry point of the cancellation run.
 *
 * <p>Tasks of breaching QueryGroups in {@link ResiliencyMode#ENFORCED} mode are cancelled first;
 * afterwards the node-duress handling is given a chance to cancel further tasks.</p>
 */
public final void cancelTasks() {
    // ENFORCED groups are handled unconditionally, independent of node duress
    final List<TaskCancellation> enforcedCancellations = getAllCancellableTasks(ResiliencyMode.ENFORCED);
    cancelTasks(enforcedCancellations);

    // remaining cancellations only happen while the node reports duress
    handleNodeDuress();
}

/**
 * Runs the duress actions in order while the node keeps reporting duress.
 *
 * <p>Order: tasks of deleted QueryGroups first, then tasks of breaching QueryGroups in
 * {@link ResiliencyMode#SOFT} mode. The duress supplier is re-checked before every action.</p>
 */
private void handleNodeDuress() {
    if (isNodeInDuress.getAsBoolean()) {
        // ordered steps to execute as long as the node stays in duress
        final List<Consumer<Void>> orderedSteps = List.of(
            unused -> cancelTasksFromDeletedQueryGroups(),
            unused -> cancelTasks(ResiliencyMode.SOFT)
        );

        // the size check short-circuits, so the supplier is not polled again after the last step
        for (int step = 0; step < orderedSteps.size() && isNodeInDuress.getAsBoolean(); step++) {
            orderedSteps.get(step).accept(null);
        }
    }
}

/**
 * Cancels the cancellable tasks collected from the deleted QueryGroups.
 */
private void cancelTasksFromDeletedQueryGroups() {
    final List<TaskCancellation> cancellations = getAllCancellableTasks(this.deletedQueryGroups);
    cancelTasks(cancellations);
}

/**
 * Collects the cancellable tasks of all active QueryGroups that run in the given resiliency mode
 * and are breaching a resource limit.
 *
 * @param resiliencyMode the resiliency mode whose QueryGroups should be inspected
 * @return List of tasks that can be cancelled
 */
protected List<TaskCancellation> getAllCancellableTasks(ResiliencyMode resiliencyMode) {
    final List<QueryGroup> breachingQueryGroups = getQueryGroupsToCancelFrom(resiliencyMode);
    return getAllCancellableTasks(breachingQueryGroups);
}

/**
 * Collects the cancellable tasks of every given QueryGroup, in iteration order of the collection.
 *
 * @param queryGroups the QueryGroups to collect cancellable tasks from
 * @return List of tasks that can be cancelled
 */
protected List<TaskCancellation> getAllCancellableTasks(Collection<QueryGroup> queryGroups) {
    final List<TaskCancellation> collected = new ArrayList<>();
    for (QueryGroup group : queryGroups) {
        collected.addAll(getCancellableTasksFrom(group));
    }
    return collected;
}

/**
 * Returns the active QueryGroups that run in the given resiliency mode and breach at least one
 * of their configured resource limits.
 *
 * <p>QueryGroups without a resource usage view, and resource types without a usage entry, are
 * skipped instead of failing with a {@link NullPointerException}; without usage data there is
 * nothing to evaluate for them.</p>
 *
 * @param resiliencyMode the resiliency mode a QueryGroup must have to be considered
 * @return List of QueryGroups
 */
private List<QueryGroup> getQueryGroupsToCancelFrom(ResiliencyMode resiliencyMode) {
    final List<QueryGroup> queryGroupsToCancelFrom = new ArrayList<>();

    for (QueryGroup queryGroup : this.activeQueryGroups) {
        if (queryGroup.getResiliencyMode() != resiliencyMode) {
            continue;
        }

        final QueryGroupLevelResourceUsageView usageView = queryGroupLevelResourceUsageViews.get(queryGroup.get_id());
        if (usageView == null) {
            // no usage view is available for this group, so it cannot be breaching anything
            continue;
        }
        final Map<ResourceType, QueryGroupResourceUsage> queryGroupResourcesUsage = usageView.getResourceUsageData();

        for (ResourceType resourceType : TRACKED_RESOURCES) {
            // only resources the group actually limits are relevant
            if (!queryGroup.getResourceLimits().containsKey(resourceType)) {
                continue;
            }
            final QueryGroupResourceUsage queryGroupResourceUsage = queryGroupResourcesUsage.get(resourceType);
            if (queryGroupResourceUsage != null
                && queryGroupResourceUsage.isBreachingThresholdFor(queryGroup, workloadManagementSettings)) {
                // one breached resource is enough; add the group once and move on
                queryGroupsToCancelFrom.add(queryGroup);
                break;
            }
        }
    }

    return queryGroupsToCancelFrom;
}

/**
 * Cancels the cancellable tasks of breaching QueryGroups that run in the given resiliency mode.
 *
 * @param resiliencyMode the resiliency mode whose QueryGroups should be handled
 */
private void cancelTasks(ResiliencyMode resiliencyMode) {
    final List<TaskCancellation> cancellations = getAllCancellableTasks(resiliencyMode);
    cancelTasks(cancellations);
}

/**
 * Invokes {@code cancel()} on every given task cancellation, in list order.
 *
 * @param cancellableTasks the task cancellations to execute
 */
private void cancelTasks(List<TaskCancellation> cancellableTasks) {
    for (TaskCancellation cancellation : cancellableTasks) {
        cancellation.cancel();
    }
}

/**
 * Builds the task cancellations for one QueryGroup by walking the tracked resource types and
 * collecting cancellations for every resource type that warrants cancellation.
 *
 * @param queryGroup The QueryGroup from which to get cancellable tasks
 * @return List of tasks that can be cancelled
 */
protected List<TaskCancellation> getCancellableTasksFrom(QueryGroup queryGroup) {
    final List<TaskCancellation> cancellations = new ArrayList<>();
    for (ResourceType resourceType : TRACKED_RESOURCES) {
        if (shouldCancelTasks(queryGroup, resourceType)) {
            cancellations.addAll(getTaskCancellations(queryGroup, resourceType));
        }
    }
    return cancellations;
}

/**
 * Decides whether tasks of the given QueryGroup should be cancelled for the given resource type.
 *
 * <p>Returns {@code false} when the QueryGroup is {@code null}, has no resource usage view, has no
 * limit configured for the resource type, or has no usage entry for it. Otherwise the decision is
 * delegated to the usage entry's threshold check.</p>
 *
 * @param queryGroup   the QueryGroup to evaluate, may be {@code null}
 * @param resourceType the resource type to evaluate
 * @return {@code true} if the QueryGroup is breaching its threshold for the resource type
 */
private boolean shouldCancelTasks(QueryGroup queryGroup, ResourceType resourceType) {
    if (queryGroup == null) {
        return false;
    }
    final QueryGroupLevelResourceUsageView queryGroupResourceUsageView = queryGroupLevelResourceUsageViews.get(queryGroup.get_id());
    if (queryGroupResourceUsageView == null) {
        return false;
    }
    // a resource the group does not limit cannot be breached; getQueryGroupsToCancelFrom and getReduceBy apply the same rule
    if (!queryGroup.getResourceLimits().containsKey(resourceType)) {
        return false;
    }
    final QueryGroupResourceUsage queryGroupResourceUsage = queryGroupResourceUsageView.getResourceUsageData().get(resourceType);
    // a missing usage entry means there is nothing to compare against the threshold
    return queryGroupResourceUsage != null && queryGroupResourceUsage.isBreachingThresholdFor(queryGroup, workloadManagementSettings);
}

/**
 * Asks the selection strategy which active tasks of the QueryGroup to cancel for the given
 * resource type and wraps each selected task in a {@link TaskCancellation} carrying a reason.
 *
 * @param queryGroup   the QueryGroup whose active tasks are candidates
 * @param resourceType the resource type for which usage has to be reduced
 * @return the task cancellations for the selected tasks
 */
private List<TaskCancellation> getTaskCancellations(QueryGroup queryGroup, ResourceType resourceType) {
    final List<QueryGroupTask> candidates = defaultTaskSelectionStrategy.selectTasksForCancellation(
        queryGroupLevelResourceUsageViews.get(queryGroup.get_id()).getActiveTasks(),
        getReduceBy(queryGroup, resourceType),
        resourceType
    );
    return candidates.stream()
        .map(candidate -> createTaskCancellation(candidate, createCancellationReason(queryGroup, candidate, resourceType)))
        .collect(Collectors.toList());
}

/**
 * Builds the human readable cancellation reason naming the task, the QueryGroup, the breached
 * limit (as a percentage) and the resource type.
 *
 * @param querygroup   the QueryGroup the task belongs to
 * @param task         the task being cancelled
 * @param resourceType the resource type whose limit was breached
 * @return the cancellation reason text
 */
private String createCancellationReason(QueryGroup querygroup, QueryGroupTask task, ResourceType resourceType) {
    final Double limitInPercent = getThresholdInPercent(querygroup, resourceType);
    final StringBuilder reason = new StringBuilder("[Workload Management] Cancelling Task ID : ");
    reason.append(task.getId());
    reason.append(" from QueryGroup ID : ").append(querygroup.get_id());
    reason.append(" breached the resource limit of : ").append(limitInPercent);
    reason.append(" for resource type : ").append(resourceType.getName());
    return reason.toString();
}

/**
 * Returns the QueryGroup's configured limit for the resource type multiplied by 100.
 *
 * @param querygroup   the QueryGroup holding the resource limits
 * @param resourceType the resource type to look up
 * @return the limit scaled by 100
 */
private Double getThresholdInPercent(QueryGroup querygroup, ResourceType resourceType) {
    final double configuredLimit = querygroup.getResourceLimits().get(resourceType);
    return configuredLimit * 100;
}

/**
 * Wraps a task in a {@link TaskCancellation} with a single reason and the on-cancel callback
 * of this class.
 *
 * @param task               the task to cancel
 * @param cancellationReason the reason text attached to the cancellation
 * @return the task cancellation
 */
private TaskCancellation createTaskCancellation(CancellableTask task, String cancellationReason) {
    // NOTE(review): the second Reason argument (5) is a fixed value for every cancellation built here
    final TaskCancellation.Reason reason = new TaskCancellation.Reason(cancellationReason, 5);
    return new TaskCancellation(task, List.of(reason), List.of(this::callbackOnCancel));
}

/**
 * Builds a task cancellation for every active task of the given (deleted) QueryGroup.
 *
 * <p>If no resource usage view exists for the QueryGroup, an empty list is returned instead of
 * failing with a {@link NullPointerException}.</p>
 *
 * @param queryGroup the deleted QueryGroup whose active tasks should be cancelled
 * @return the task cancellations, empty if the QueryGroup has no usage view
 */
protected List<TaskCancellation> getTaskCancellationsForDeletedQueryGroup(QueryGroup queryGroup) {
    final List<TaskCancellation> taskCancellations = new ArrayList<>();

    final QueryGroupLevelResourceUsageView usageView = queryGroupLevelResourceUsageViews.get(queryGroup.get_id());
    if (usageView == null) {
        // no view means no known active tasks for this group
        return taskCancellations;
    }

    for (QueryGroupTask task : usageView.getActiveTasks()) {
        String cancellationReason = "[Workload Management] Cancelling Task ID : "
            + task.getId()
            + " from QueryGroup ID : "
            + queryGroup.get_id();
        taskCancellations.add(createTaskCancellation(task, cancellationReason));
    }
    return taskCancellations;
}

/**
 * Computes by how much the usage of the given resource type has to be reduced for the QueryGroup.
 *
 * <p>Yields 0 when the QueryGroup has no limit for the resource type or no resource usage view;
 * otherwise the value comes from the usage entry of that resource type.</p>
 *
 * @param queryGroup   the QueryGroup to evaluate
 * @param resourceType the resource type to evaluate
 * @return the amount to reduce by, or 0 if it cannot be determined
 */
private double getReduceBy(QueryGroup queryGroup, ResourceType resourceType) {
    final boolean hasLimit = queryGroup.getResourceLimits().get(resourceType) != null;
    if (!hasLimit) {
        return 0;
    }
    if (!queryGroupLevelResourceUsageViews.containsKey(queryGroup.get_id())) {
        return 0;
    }
    return queryGroupLevelResourceUsageViews.get(queryGroup.get_id())
        .getResourceUsageData()
        .get(resourceType)
        .getReduceByFor(queryGroup, workloadManagementSettings);
}

/**
 * Callback attached to every {@link TaskCancellation} created by createTaskCancellation.
 * Currently a no-op placeholder.
 */
private void callbackOnCancel() {
// TODO Implement callback logic here mostly used for Stats
}
}
Loading
Loading