-
Notifications
You must be signed in to change notification settings - Fork 3.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement adaptive remote task request size #10013
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -133,7 +133,7 @@ public class SqlTaskExecution | |
private final ConcurrentMap<PlanNodeId, TaskSource> unpartitionedSources = new ConcurrentHashMap<>(); | ||
|
||
@GuardedBy("this") | ||
private long maxAcknowledgedSplit = Long.MIN_VALUE; | ||
private final Map<PlanNodeId, Long> maxAcknowledgedSplitByPlanNode = new HashMap<>(); | ||
|
||
@GuardedBy("this") | ||
private final SchedulingLifespanManager schedulingLifespanManager; | ||
|
@@ -325,12 +325,11 @@ private synchronized Map<PlanNodeId, TaskSource> updateSources(List<TaskSource> | |
Map<PlanNodeId, TaskSource> updatedUnpartitionedSources = new HashMap<>(); | ||
|
||
// first remove any split that was already acknowledged | ||
long currentMaxAcknowledgedSplit = this.maxAcknowledgedSplit; | ||
sources = sources.stream() | ||
.map(source -> new TaskSource( | ||
source.getPlanNodeId(), | ||
source.getSplits().stream() | ||
.filter(scheduledSplit -> scheduledSplit.getSequenceId() > currentMaxAcknowledgedSplit) | ||
.filter(scheduledSplit -> scheduledSplit.getSequenceId() > maxAcknowledgedSplitByPlanNode.getOrDefault(source.getPlanNodeId(), Long.MIN_VALUE)) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There could be lots of splits. In this case, there will be lots of map accesses. It would be better to do it like the below,
|
||
.collect(Collectors.toSet()), | ||
// Like splits, noMoreSplitsForLifespan could be pruned so that only new items will be processed. | ||
// This is not happening here because correctness won't be compromised due to duplicate events for noMoreSplitsForLifespan. | ||
|
@@ -354,11 +353,18 @@ private synchronized Map<PlanNodeId, TaskSource> updateSources(List<TaskSource> | |
} | ||
|
||
// update maxAcknowledgedSplit | ||
maxAcknowledgedSplit = sources.stream() | ||
.flatMap(source -> source.getSplits().stream()) | ||
.mapToLong(ScheduledSplit::getSequenceId) | ||
.max() | ||
.orElse(maxAcknowledgedSplit); | ||
for (TaskSource taskSource : sources) { | ||
long maxAcknowledgedSplit = taskSource.getSplits().stream() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. When taskSource.getSplits().size() == 0, it can 'continue;'. |
||
.mapToLong(ScheduledSplit::getSequenceId) | ||
.max() | ||
.orElse(Long.MIN_VALUE); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If taskSource.getSplits().size() == 0 is already checked, .orElse(Long.MIN_VALUE) is not needed. Instead of that, just '.getAsLong()'. |
||
PlanNodeId planNodeId = taskSource.getPlanNodeId(); | ||
|
||
if (!maxAcknowledgedSplitByPlanNode.containsKey(planNodeId)) { | ||
maxAcknowledgedSplitByPlanNode.put(planNodeId, Long.MIN_VALUE); | ||
} | ||
Comment on lines
+363
to
+365
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If taskSource.getSplits().size() == 0 is already checked, this is not needed. |
||
maxAcknowledgedSplitByPlanNode.computeIfPresent(planNodeId, (key, val) -> maxAcknowledgedSplit > val ? maxAcknowledgedSplit : val); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If taskSource.getSplits().size() == 0 is already checked, it should throw when val >= maxAcknowledgedSplit. e.g.
|
||
} | ||
return updatedUnpartitionedSources; | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,7 @@ | |
|
||
import com.google.common.collect.HashMultimap; | ||
import com.google.common.collect.ImmutableList; | ||
import com.google.common.collect.ImmutableSet; | ||
import com.google.common.collect.Multimap; | ||
import com.google.common.collect.SetMultimap; | ||
import com.google.common.net.HttpHeaders; | ||
|
@@ -64,6 +65,7 @@ | |
|
||
import java.net.URI; | ||
import java.util.Collection; | ||
import java.util.Comparator; | ||
import java.util.HashMap; | ||
import java.util.List; | ||
import java.util.Map; | ||
|
@@ -79,6 +81,7 @@ | |
import java.util.concurrent.atomic.AtomicBoolean; | ||
import java.util.concurrent.atomic.AtomicLong; | ||
import java.util.concurrent.atomic.AtomicReference; | ||
import java.util.stream.Collectors; | ||
import java.util.stream.Stream; | ||
|
||
import static com.google.common.base.MoreObjects.toStringHelper; | ||
|
@@ -91,7 +94,11 @@ | |
import static io.airlift.http.client.Request.Builder.prepareDelete; | ||
import static io.airlift.http.client.Request.Builder.preparePost; | ||
import static io.airlift.http.client.StaticBodyGenerator.createStaticBodyGenerator; | ||
import static io.trino.SystemSessionProperties.getMaxRemoteTaskRequestSize; | ||
import static io.trino.SystemSessionProperties.getMaxUnacknowledgedSplitsPerTask; | ||
import static io.trino.SystemSessionProperties.getRemoteTaskGuaranteedSplitsPerRequest; | ||
import static io.trino.SystemSessionProperties.getRemoteTaskRequestSizeHeadroom; | ||
import static io.trino.SystemSessionProperties.isEnableAdaptiveTaskRequestSize; | ||
import static io.trino.execution.DynamicFiltersCollector.INITIAL_DYNAMIC_FILTERS_VERSION; | ||
import static io.trino.execution.TaskInfo.createInitialTask; | ||
import static io.trino.execution.TaskState.ABORTED; | ||
|
@@ -172,6 +179,14 @@ public final class HttpRemoteTask | |
private final AtomicBoolean started = new AtomicBoolean(false); | ||
private final AtomicBoolean aborting = new AtomicBoolean(false); | ||
|
||
@GuardedBy("this") | ||
private int splitBatchSize; | ||
|
||
private final int guaranteedSplitsPerRequest; | ||
private final long maxRequestSize; | ||
private final long requestSizeHeadroom; | ||
private final boolean enableAdaptiveTaskRequestSize; | ||
|
||
public HttpRemoteTask( | ||
Session session, | ||
TaskId taskId, | ||
|
@@ -235,6 +250,12 @@ public HttpRemoteTask( | |
} | ||
maxUnacknowledgedSplits = getMaxUnacknowledgedSplitsPerTask(session); | ||
|
||
this.guaranteedSplitsPerRequest = getRemoteTaskGuaranteedSplitsPerRequest(session); | ||
this.maxRequestSize = getMaxRemoteTaskRequestSize(session).toBytes(); | ||
this.requestSizeHeadroom = getRemoteTaskRequestSizeHeadroom(session).toBytes(); | ||
this.splitBatchSize = maxUnacknowledgedSplits; | ||
this.enableAdaptiveTaskRequestSize = isEnableAdaptiveTaskRequestSize(session); | ||
|
||
int pendingSourceSplitCount = 0; | ||
long pendingSourceSplitsWeight = 0; | ||
for (PlanNodeId planNodeId : planFragment.getPartitionedSources()) { | ||
|
@@ -557,6 +578,10 @@ private synchronized void processTaskUpdate(TaskInfo newValue, List<TaskSource> | |
pendingSourceSplitsWeight -= removedWeight; | ||
} | ||
} | ||
// set needsUpdate to true when there are still pending splits | ||
if (pendingSplits.size() > 0) { | ||
needsUpdate.set(true); | ||
} | ||
// Update node level split tracker before split queue space to ensure it's up to date before waking up the scheduler | ||
partitionedSplitCountTracker.setPartitionedSplits(getPartitionedSplitsInfo()); | ||
updateSplitQueueSpace(); | ||
|
@@ -580,6 +605,27 @@ private synchronized void triggerUpdate() | |
sendUpdate(); | ||
} | ||
|
||
/** | ||
* Adaptively adjust batch size to meet expected request size: | ||
* If requestSize is not equal to expectedSize, this function will try to estimate and adjust the batch size proportionally based on | ||
* current number of splits and size of request. | ||
*/ | ||
private synchronized void adjustSplitBatchSize(List<TaskSource> sources, long requestSize, long expectedSize) | ||
{ | ||
int numSplits = 0; | ||
for (TaskSource taskSource : sources) { | ||
numSplits = Math.max(numSplits, taskSource.getSplits().size()); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It should find max only for partitioned sources. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. With colocation joins with bucketed tables, there could be more than one partitioned source. In that case, this logic will become awkward. Please apply this logic only when there is one partitioned source. |
||
} | ||
if (requestSize <= 0 || numSplits == 0) { | ||
return; | ||
} | ||
if ((requestSize > expectedSize && splitBatchSize > guaranteedSplitsPerRequest) || (requestSize < expectedSize && splitBatchSize < maxUnacknowledgedSplits)) { | ||
int newSplitBatchSize = (int) (numSplits * ((double) expectedSize / requestSize)); | ||
newSplitBatchSize = Math.max(guaranteedSplitsPerRequest, Math.min(maxUnacknowledgedSplits, newSplitBatchSize)); | ||
splitBatchSize = newSplitBatchSize; | ||
} | ||
} | ||
|
||
private synchronized void sendUpdate() | ||
{ | ||
TaskStatus taskStatus = getTaskStatus(); | ||
|
@@ -614,6 +660,20 @@ private synchronized void sendUpdate() | |
outputBuffers.get(), | ||
dynamicFilterDomains.getDynamicFilterDomains()); | ||
byte[] taskUpdateRequestJson = taskUpdateRequestCodec.toJsonBytes(updateRequest); | ||
|
||
if (enableAdaptiveTaskRequestSize) { | ||
int oldSplitBatchSize = splitBatchSize; | ||
// try to adjust batch size to meet expected request size: (requestSizeLimit - requestSizeLimitHeadroom) | ||
adjustSplitBatchSize(sources, taskUpdateRequestJson.length, maxRequestSize - requestSizeHeadroom); | ||
// abandon current request and reschedule update if size of request body exceeds requestSizeLimit | ||
// and splitBatchSize is updated | ||
if (taskUpdateRequestJson.length > maxRequestSize && splitBatchSize < oldSplitBatchSize) { | ||
log.debug("%s - current taskUpdateRequestJson exceeded limit: %d, abandon.", taskId, taskUpdateRequestJson.length); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. log.info? |
||
scheduleUpdate(); | ||
return; | ||
} | ||
} | ||
|
||
if (fragment.isPresent()) { | ||
stats.updateWithPlanBytes(taskUpdateRequestJson.length); | ||
} | ||
|
@@ -646,21 +706,36 @@ private synchronized void sendUpdate() | |
|
||
private synchronized List<TaskSource> getSources() | ||
{ | ||
return Stream.concat(planFragment.getPartitionedSourceNodes().stream(), planFragment.getRemoteSourceNodes().stream()) | ||
.filter(Objects::nonNull) | ||
.map(PlanNode::getId) | ||
.map(this::getSource) | ||
.filter(Objects::nonNull) | ||
.collect(toImmutableList()); | ||
return Stream.concat( | ||
planFragment.getPartitionedSourceNodes().stream() | ||
.filter(Objects::nonNull) | ||
.map(PlanNode::getId) | ||
.map(planNodeId -> getSource(planNodeId, true)), | ||
planFragment.getRemoteSourceNodes().stream() | ||
.filter(Objects::nonNull) | ||
.map(PlanNode::getId) | ||
.map(planNodeId -> getSource(planNodeId, false)) | ||
).filter(Objects::nonNull).collect(toImmutableList()); | ||
} | ||
|
||
private synchronized TaskSource getSource(PlanNodeId planNodeId) | ||
private synchronized TaskSource getSource(PlanNodeId planNodeId, boolean isPartitionedSource) | ||
{ | ||
Set<ScheduledSplit> splits = pendingSplits.get(planNodeId); | ||
boolean pendingNoMoreSplits = Boolean.TRUE.equals(this.noMoreSplits.get(planNodeId)); | ||
boolean noMoreSplits = this.noMoreSplits.containsKey(planNodeId); | ||
Set<Lifespan> noMoreSplitsForLifespan = pendingNoMoreSplitsForLifespan.get(planNodeId); | ||
|
||
// only apply batchSize to partitioned sources | ||
if (isPartitionedSource && splitBatchSize < splits.size()) { | ||
splits = splits.stream() | ||
.sorted(Comparator.comparingLong(ScheduledSplit::getSequenceId)) | ||
.limit(splitBatchSize) | ||
.collect(Collectors.toSet()); | ||
// if not last batch, we need to defer setting no more splits | ||
noMoreSplits = false; | ||
noMoreSplitsForLifespan = ImmutableSet.of(); | ||
} | ||
|
||
TaskSource element = null; | ||
if (!splits.isEmpty() || !noMoreSplitsForLifespan.isEmpty() || pendingNoMoreSplits) { | ||
element = new TaskSource(planNodeId, splits, noMoreSplitsForLifespan, noMoreSplits); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why isn't it 'true'?