From 01cfef4aff29d0e09f83bfd37a874e5ffbfb24cb Mon Sep 17 00:00:00 2001 From: Martin Trieu Date: Wed, 3 Jul 2024 19:16:48 -0700 Subject: [PATCH 1/7] add getdata clients and shareable logic for GetData w/ throttling and metric tracking. --- .../MetricTrackingWindmillServerStub.java | 355 ----------------- .../worker/StreamingDataflowWorker.java | 32 +- .../windmill/ApplianceWindmillClient.java | 39 ++ .../StreamingEngineWindmillClient.java | 54 +++ .../worker/windmill/WindmillServerBase.java | 5 - .../worker/windmill/WindmillServerStub.java | 58 +-- .../getdata/ApplianceGetDataClient.java | 270 +++++++++++++ .../client/getdata/GetDataClient.java | 46 +++ .../getdata/StreamingEngineGetDataClient.java | 146 +++++++ .../ThrottlingGetDataMetricTracker.java | 149 +++++++ .../client/grpc/GrpcWindmillServer.java | 5 - .../dataflow/worker/FakeWindmillServer.java | 23 +- .../worker/StreamingDataflowWorkerTest.java | 5 - .../sideinput/SideInputStateFetcherTest.java | 4 +- .../ThrottlingGetDataMetricTrackerTest.java | 374 ++++++++++++++++++ .../state/WindmillStateReaderTest.java | 4 +- 16 files changed, 1108 insertions(+), 461 deletions(-) delete mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/MetricTrackingWindmillServerStub.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/ApplianceWindmillClient.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/StreamingEngineWindmillClient.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ApplianceGetDataClient.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/GetDataClient.java create mode 100644 
runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamingEngineGetDataClient.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTracker.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTrackerTest.java diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/MetricTrackingWindmillServerStub.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/MetricTrackingWindmillServerStub.java deleted file mode 100644 index d808d4f4ab589..0000000000000 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/MetricTrackingWindmillServerStub.java +++ /dev/null @@ -1,355 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.beam.runners.dataflow.worker; - -import com.google.auto.value.AutoBuilder; -import java.io.PrintWriter; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.atomic.AtomicInteger; -import javax.annotation.concurrent.GuardedBy; -import org.apache.beam.runners.dataflow.worker.util.MemoryMonitor; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillServerStub; -import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; -import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStreamPool; -import org.apache.beam.sdk.annotations.Internal; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.SettableFuture; -import org.checkerframework.checker.nullness.qual.Nullable; -import org.joda.time.Duration; - -/** - * Wrapper around a {@link WindmillServerStub} that tracks metrics for the number of in-flight - * requests and throttles requests when memory pressure is high. - * - *

External API: individual worker threads request state for their computation via {@link - * #getStateData}. However, requests are either issued using a pool of streaming rpcs or possibly - * batched requests. - */ -@SuppressWarnings({ - "nullness" // TODO(https://github.com/apache/beam/issues/20497) -}) -public class MetricTrackingWindmillServerStub { - - private static final int MAX_READS_PER_BATCH = 60; - private static final int MAX_ACTIVE_READS = 10; - private static final Duration STREAM_TIMEOUT = Duration.standardSeconds(30); - private final AtomicInteger activeSideInputs = new AtomicInteger(); - private final AtomicInteger activeStateReads = new AtomicInteger(); - private final AtomicInteger activeHeartbeats = new AtomicInteger(); - private final WindmillServerStub server; - private final MemoryMonitor gcThrashingMonitor; - private final boolean useStreamingRequests; - - private final WindmillStreamPool getDataStreamPool; - - // This may be the same instance as getDataStreamPool based upon options. 
- private final WindmillStreamPool heartbeatStreamPool; - - @GuardedBy("this") - private final List pendingReadBatches; - - @GuardedBy("this") - private int activeReadThreads = 0; - - @Internal - @AutoBuilder(ofClass = MetricTrackingWindmillServerStub.class) - public abstract static class Builder { - - abstract Builder setServer(WindmillServerStub server); - - abstract Builder setGcThrashingMonitor(MemoryMonitor gcThrashingMonitor); - - abstract Builder setUseStreamingRequests(boolean useStreamingRequests); - - abstract Builder setUseSeparateHeartbeatStreams(boolean useSeparateHeartbeatStreams); - - abstract Builder setNumGetDataStreams(int numGetDataStreams); - - abstract MetricTrackingWindmillServerStub build(); - } - - public static Builder builder(WindmillServerStub server, MemoryMonitor gcThrashingMonitor) { - return new AutoBuilder_MetricTrackingWindmillServerStub_Builder() - .setServer(server) - .setGcThrashingMonitor(gcThrashingMonitor) - .setUseStreamingRequests(false) - .setUseSeparateHeartbeatStreams(false) - .setNumGetDataStreams(1); - } - - MetricTrackingWindmillServerStub( - WindmillServerStub server, - MemoryMonitor gcThrashingMonitor, - boolean useStreamingRequests, - boolean useSeparateHeartbeatStreams, - int numGetDataStreams) { - this.server = server; - this.gcThrashingMonitor = gcThrashingMonitor; - this.useStreamingRequests = useStreamingRequests; - if (useStreamingRequests) { - getDataStreamPool = - WindmillStreamPool.create( - Math.max(1, numGetDataStreams), STREAM_TIMEOUT, this.server::getDataStream); - if (useSeparateHeartbeatStreams) { - heartbeatStreamPool = - WindmillStreamPool.create(1, STREAM_TIMEOUT, this.server::getDataStream); - } else { - heartbeatStreamPool = getDataStreamPool; - } - } else { - getDataStreamPool = heartbeatStreamPool = null; - } - // This is used as a queue but is expected to be less than 10 batches. 
- this.pendingReadBatches = new ArrayList<>(); - } - - // Adds the entry to a read batch for sending to the windmill server. If a non-null batch is - // returned, this thread will be responsible for sending the batch and should wait for the batch - // startRead to be notified. - // If null is returned, the entry was added to a read batch that will be issued by another thread. - private @Nullable ReadBatch addToReadBatch(QueueEntry entry) { - synchronized (this) { - ReadBatch batch; - if (activeReadThreads < MAX_ACTIVE_READS) { - assert (pendingReadBatches.isEmpty()); - activeReadThreads += 1; - // fall through to below synchronized block - } else if (pendingReadBatches.isEmpty() - || pendingReadBatches.get(pendingReadBatches.size() - 1).reads.size() - >= MAX_READS_PER_BATCH) { - // This is the first read of a batch, it will be responsible for sending the batch. - batch = new ReadBatch(); - pendingReadBatches.add(batch); - batch.reads.add(entry); - return batch; - } else { - // This fits within an existing batch, it will be sent by the first blocking thread in the - // batch. - pendingReadBatches.get(pendingReadBatches.size() - 1).reads.add(entry); - return null; - } - } - ReadBatch batch = new ReadBatch(); - batch.reads.add(entry); - batch.startRead.set(true); - return batch; - } - - private void issueReadBatch(ReadBatch batch) { - try { - boolean read = batch.startRead.get(); - assert (read); - } catch (InterruptedException e) { - // We don't expect this thread to be interrupted. To simplify handling, we just fall through - // to issuing - // the call. - assert (false); - Thread.currentThread().interrupt(); - } catch (ExecutionException e) { - // startRead is a SettableFuture so this should never occur. 
- throw new AssertionError("Should not have exception on startRead", e); - } - Map> pendingResponses = - new HashMap<>(batch.reads.size()); - Map computationBuilders = new HashMap<>(); - for (QueueEntry entry : batch.reads) { - Windmill.ComputationGetDataRequest.Builder computationBuilder = - computationBuilders.computeIfAbsent( - entry.computation, - k -> Windmill.ComputationGetDataRequest.newBuilder().setComputationId(k)); - - computationBuilder.addRequests(entry.request); - pendingResponses.put( - WindmillComputationKey.create( - entry.computation, entry.request.getKey(), entry.request.getShardingKey()), - entry.response); - } - - // Build the full GetDataRequest from the KeyedGetDataRequests pulled from the queue. - Windmill.GetDataRequest.Builder builder = Windmill.GetDataRequest.newBuilder(); - for (Windmill.ComputationGetDataRequest.Builder computationBuilder : - computationBuilders.values()) { - builder.addRequests(computationBuilder); - } - - try { - Windmill.GetDataResponse response = server.getData(builder.build()); - - // Dispatch the per-key responses back to the waiting threads. - for (Windmill.ComputationGetDataResponse computationResponse : response.getDataList()) { - for (Windmill.KeyedGetDataResponse keyResponse : computationResponse.getDataList()) { - pendingResponses - .get( - WindmillComputationKey.create( - computationResponse.getComputationId(), - keyResponse.getKey(), - keyResponse.getShardingKey())) - .set(keyResponse); - } - } - } catch (RuntimeException e) { - // Fan the exception out to the reads. - for (QueueEntry entry : batch.reads) { - entry.response.setException(e); - } - } finally { - synchronized (this) { - assert (activeReadThreads >= 1); - if (pendingReadBatches.isEmpty()) { - activeReadThreads--; - } else { - // Notify the thread responsible for issuing the next batch read. 
- ReadBatch startBatch = pendingReadBatches.remove(0); - startBatch.startRead.set(true); - } - } - } - } - - public Windmill.KeyedGetDataResponse getStateData( - String computation, Windmill.KeyedGetDataRequest request) { - gcThrashingMonitor.waitForResources("GetStateData"); - activeStateReads.getAndIncrement(); - - try { - if (useStreamingRequests) { - GetDataStream stream = getDataStreamPool.getStream(); - try { - return stream.requestKeyedData(computation, request); - } finally { - getDataStreamPool.releaseStream(stream); - } - } else { - SettableFuture response = SettableFuture.create(); - ReadBatch batch = addToReadBatch(new QueueEntry(computation, request, response)); - if (batch != null) { - issueReadBatch(batch); - } - return response.get(); - } - } catch (Exception e) { - throw new RuntimeException(e); - } finally { - activeStateReads.getAndDecrement(); - } - } - - public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { - gcThrashingMonitor.waitForResources("GetSideInputData"); - activeSideInputs.getAndIncrement(); - try { - if (useStreamingRequests) { - GetDataStream stream = getDataStreamPool.getStream(); - try { - return stream.requestGlobalData(request); - } finally { - getDataStreamPool.releaseStream(stream); - } - } else { - return server - .getData( - Windmill.GetDataRequest.newBuilder().addGlobalDataFetchRequests(request).build()) - .getGlobalData(0); - } - } catch (Exception e) { - throw new RuntimeException("Failed to get side input: ", e); - } finally { - activeSideInputs.getAndDecrement(); - } - } - - /** Tells windmill processing is ongoing for the given keys. 
*/ - public void refreshActiveWork(Map> heartbeats) { - if (heartbeats.isEmpty()) { - return; - } - activeHeartbeats.set(heartbeats.size()); - try { - if (useStreamingRequests) { - GetDataStream stream = heartbeatStreamPool.getStream(); - try { - stream.refreshActiveWork(heartbeats); - } finally { - heartbeatStreamPool.releaseStream(stream); - } - } else { - // This code path is only used by appliance which sends heartbeats (used to refresh active - // work) as KeyedGetDataRequests. So we must translate the HeartbeatRequest to a - // KeyedGetDataRequest here regardless of the value of sendKeyedGetDataRequests. - Windmill.GetDataRequest.Builder builder = Windmill.GetDataRequest.newBuilder(); - for (Map.Entry> entry : heartbeats.entrySet()) { - Windmill.ComputationGetDataRequest.Builder perComputationBuilder = - Windmill.ComputationGetDataRequest.newBuilder(); - perComputationBuilder.setComputationId(entry.getKey()); - for (HeartbeatRequest request : entry.getValue()) { - perComputationBuilder.addRequests( - Windmill.KeyedGetDataRequest.newBuilder() - .setShardingKey(request.getShardingKey()) - .setWorkToken(request.getWorkToken()) - .setCacheToken(request.getCacheToken()) - .addAllLatencyAttribution(request.getLatencyAttributionList()) - .build()); - } - builder.addRequests(perComputationBuilder.build()); - } - server.getData(builder.build()); - } - } finally { - activeHeartbeats.set(0); - } - } - - public void printHtml(PrintWriter writer) { - writer.println("Active Fetches:"); - writer.println(" Side Inputs: " + activeSideInputs.get()); - writer.println(" State Reads: " + activeStateReads.get()); - if (!useStreamingRequests) { - synchronized (this) { - writer.println(" Read threads: " + activeReadThreads); - writer.println(" Pending read batches: " + pendingReadBatches.size()); - } - } - writer.println("Heartbeat Keys Active: " + activeHeartbeats.get()); - } - - private static final class ReadBatch { - ArrayList reads = new ArrayList<>(); - SettableFuture 
startRead = SettableFuture.create(); - } - - private static final class QueueEntry { - - final String computation; - final Windmill.KeyedGetDataRequest request; - final SettableFuture response; - - QueueEntry( - String computation, - Windmill.KeyedGetDataRequest request, - SettableFuture response) { - this.computation = computation; - this.request = request; - this.response = response; - } - } -} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java index 0e46e7e4687ea..829396cd6dd78 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java @@ -72,6 +72,10 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.commits.StreamingApplianceWorkCommitter; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.StreamingEngineWorkCommitter; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.ApplianceGetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.StreamingEngineGetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.ThrottlingGetDataMetricTracker; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.ChannelzServlet; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcDispatcherClient; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcWindmillServer; @@ -158,7 +162,7 @@ public class StreamingDataflowWorker { private 
final AtomicBoolean running = new AtomicBoolean(); private final DataflowWorkerHarnessOptions options; private final long clientId; - private final MetricTrackingWindmillServerStub metricTrackingWindmillServer; + private final GetDataClient getDataClient; private final MemoryMonitor memoryMonitor; private final Thread memoryMonitorThread; private final ReaderCache readerCache; @@ -238,12 +242,16 @@ private StreamingDataflowWorker( dispatchThread.setName("DispatchThread"); this.clientId = clientId; this.windmillServer = windmillServer; - this.metricTrackingWindmillServer = - MetricTrackingWindmillServerStub.builder(windmillServer, memoryMonitor) - .setUseStreamingRequests(windmillServiceEnabled) - .setUseSeparateHeartbeatStreams(options.getUseSeparateWindmillHeartbeatStreams()) - .setNumGetDataStreams(options.getWindmillGetDataStreamCount()) - .build(); + + ThrottlingGetDataMetricTracker getDataMetricTracker = + new ThrottlingGetDataMetricTracker(memoryMonitor); + this.getDataClient = + windmillServiceEnabled + ? StreamingEngineGetDataClient.builder(windmillServer, getDataMetricTracker) + .setUseSeparateHeartbeatStreams(options.getUseSeparateWindmillHeartbeatStreams()) + .setNumGetDataStreams(options.getWindmillGetDataStreamCount()) + .build() + : ApplianceGetDataClient.create(windmillServer, getDataMetricTracker); // Register standard file systems. 
FileSystems.setDefaultPipelineOptions(options); @@ -259,7 +267,7 @@ private StreamingDataflowWorker( stuckCommitDurationMillis, computationStateCache::getAllPresentComputations, sampler, - metricTrackingWindmillServer::refreshActiveWork, + getDataClient::refreshActiveWork, executorSupplier.apply("RefreshWork")); WorkerStatusPages workerStatusPages = @@ -273,7 +281,7 @@ private StreamingDataflowWorker( .setStateCache(stateCache) .setComputationStateCache(computationStateCache) .setCurrentActiveCommitBytes(workCommitter::currentActiveCommitBytes) - .setGetDataStatusProvider(metricTrackingWindmillServer::printHtml) + .setGetDataStatusProvider(getDataClient::printHtml) .setWorkUnitExecutor(workUnitExecutor); this.statusPages = @@ -298,7 +306,7 @@ private StreamingDataflowWorker( mapTaskExecutorFactory, workUnitExecutor, stateCache::forComputation, - metricTrackingWindmillServer::getSideInputData, + getDataClient::getSideInputData, failureTracker, workFailureProcessor, streamingCounters, @@ -829,7 +837,7 @@ private void dispatchLoop() { workItem, watermarks.setOutputDataWatermark(workItem.getOutputDataWatermark()).build(), Work.createProcessingContext( - computationId, metricTrackingWindmillServer::getStateData, workCommitter::commit), + computationId, getDataClient::getStateData, workCommitter::commit), /* getWorkStreamLatencies= */ Collections.emptyList()); } } @@ -865,7 +873,7 @@ void streamingDispatchLoop() { .build(), Work.createProcessingContext( computationState.getComputationId(), - metricTrackingWindmillServer::getStateData, + getDataClient::getStateData, workCommitter::commit), getWorkStreamLatencies); })); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/ApplianceWindmillClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/ApplianceWindmillClient.java new file mode 100644 index 0000000000000..2cd3748eb31be --- /dev/null +++ 
b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/ApplianceWindmillClient.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill; + +import org.apache.beam.sdk.annotations.Internal; + +/** Client for WindmillService via Streaming Appliance. */ +@Internal +public interface ApplianceWindmillClient { + /** Get a batch of work to process. */ + Windmill.GetWorkResponse getWork(Windmill.GetWorkRequest request); + + /** Get additional data such as state needed to process work. */ + Windmill.GetDataResponse getData(Windmill.GetDataRequest request); + + /** Commit the work, issuing any output productions, state modifications etc. */ + Windmill.CommitWorkResponse commitWork(Windmill.CommitWorkRequest request); + + /** Get configuration data from the server. */ + Windmill.GetConfigResponse getConfig(Windmill.GetConfigRequest request); + + /** Report execution information to the server. 
*/ + Windmill.ReportStatsResponse reportStats(Windmill.ReportStatsRequest request); +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/StreamingEngineWindmillClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/StreamingEngineWindmillClient.java new file mode 100644 index 0000000000000..e02e6c1123583 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/StreamingEngineWindmillClient.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill; + +import java.util.Set; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream; +import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemReceiver; +import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.net.HostAndPort; + +/** Client for WindmillService via Streaming Engine. 
*/ +@Internal +public interface StreamingEngineWindmillClient { + /** Returns the windmill service endpoints set by setWindmillServiceEndpoints */ + ImmutableSet getWindmillServiceEndpoints(); + + /** + * Sets the new endpoints used to talk to windmill. Upon first call, the stubs are initialized. On + * subsequent calls, if endpoints are different from previous values new stubs are created, + * replacing the previous ones. + */ + void setWindmillServiceEndpoints(Set endpoints); + + /** + * Gets work to process, returned as a stream. + * + *

Each time a WorkItem is received, it will be passed to the given receiver. The returned + * GetWorkStream object can be used to control the lifetime of the stream. + */ + WindmillStream.GetWorkStream getWorkStream( + Windmill.GetWorkRequest request, WorkItemReceiver receiver); + + /** Get additional data such as state needed to process work, returned as a stream. */ + WindmillStream.GetDataStream getDataStream(); + + /** Returns a stream allowing individual WorkItemCommitRequests to be streamed to Windmill. */ + WindmillStream.CommitWorkStream commitWorkStream(); +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerBase.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerBase.java index 0785ae96626ee..5f7fd6da9d4b2 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerBase.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerBase.java @@ -59,11 +59,6 @@ public ImmutableSet getWindmillServiceEndpoints() { return ImmutableSet.of(); } - @Override - public boolean isReady() { - return true; - } - @Override public Windmill.GetWorkResponse getWork(Windmill.GetWorkRequest workRequest) { try { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerStub.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerStub.java index 7d0c4f5aba327..cd753cb8ec91f 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerStub.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerStub.java 
@@ -18,65 +18,11 @@ package org.apache.beam.runners.dataflow.worker.windmill; import java.io.PrintWriter; -import java.util.Set; import org.apache.beam.runners.dataflow.worker.status.StatusDataProvider; -import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.CommitWorkStream; -import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; -import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; -import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemReceiver; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.net.HostAndPort; /** Stub for communicating with a Windmill server. */ -@SuppressWarnings({ - "nullness" // TODO(https://github.com/apache/beam/issues/20497) -}) -public abstract class WindmillServerStub implements StatusDataProvider { - - /** - * Sets the new endpoints used to talk to windmill. Upon first call, the stubs are initialized. On - * subsequent calls, if endpoints are different from previous values new stubs are created, - * replacing the previous ones. - */ - public abstract void setWindmillServiceEndpoints(Set endpoints); - - /* - * Returns the windmill service endpoints set by setWindmillServiceEndpoints - */ - public abstract ImmutableSet getWindmillServiceEndpoints(); - - /** Returns true iff this WindmillServerStub is ready for making API calls. */ - public abstract boolean isReady(); - - /** Get a batch of work to process. */ - public abstract Windmill.GetWorkResponse getWork(Windmill.GetWorkRequest request); - - /** Get additional data such as state needed to process work. */ - public abstract Windmill.GetDataResponse getData(Windmill.GetDataRequest request); - - /** Commit the work, issuing any output productions, state modifications etc. 
*/ - public abstract Windmill.CommitWorkResponse commitWork(Windmill.CommitWorkRequest request); - - /** Get configuration data from the server. */ - public abstract Windmill.GetConfigResponse getConfig(Windmill.GetConfigRequest request); - - /** Report execution information to the server. */ - public abstract Windmill.ReportStatsResponse reportStats(Windmill.ReportStatsRequest request); - - /** - * Gets work to process, returned as a stream. - * - *

Each time a WorkItem is received, it will be passed to the given receiver. The returned - * GetWorkStream object can be used to control the lifetime of the stream. - */ - public abstract GetWorkStream getWorkStream( - Windmill.GetWorkRequest request, WorkItemReceiver receiver); - - /** Get additional data such as state needed to process work, returned as a stream. */ - public abstract GetDataStream getDataStream(); - - /** Returns a stream allowing individual WorkItemCommitRequests to be streamed to Windmill. */ - public abstract CommitWorkStream commitWorkStream(); +public abstract class WindmillServerStub + implements ApplianceWindmillClient, StreamingEngineWindmillClient, StatusDataProvider { /** Returns the amount of time the server has been throttled and resets the time to 0. */ public abstract long getAndResetThrottleTime(); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ApplianceGetDataClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ApplianceGetDataClient.java new file mode 100644 index 0000000000000..4cee027581e8e --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ApplianceGetDataClient.java @@ -0,0 +1,270 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.client.getdata; + +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutionException; +import java.util.stream.Collectors; +import javax.annotation.concurrent.GuardedBy; +import javax.annotation.concurrent.ThreadSafe; +import org.apache.beam.runners.dataflow.worker.WindmillComputationKey; +import org.apache.beam.runners.dataflow.worker.windmill.ApplianceWindmillClient; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.ComputationGetDataRequest; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; +import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.SettableFuture; +import org.checkerframework.checker.nullness.qual.Nullable; + +/** Appliance implementation of {@link GetDataClient}. 
*/ +@Internal +@ThreadSafe +public final class ApplianceGetDataClient implements GetDataClient { + private static final int MAX_READS_PER_BATCH = 60; + private static final int MAX_ACTIVE_READS = 10; + + private final ApplianceWindmillClient windmillClient; + private final ThrottlingGetDataMetricTracker getDataMetricTracker; + + @GuardedBy("this") + private final List pendingReadBatches; + + @GuardedBy("this") + private int activeReadThreads; + + private ApplianceGetDataClient( + ApplianceWindmillClient windmillClient, ThrottlingGetDataMetricTracker getDataMetricTracker) { + this.windmillClient = windmillClient; + this.getDataMetricTracker = getDataMetricTracker; + this.pendingReadBatches = new ArrayList<>(); + this.activeReadThreads = 0; + } + + private static List convertToKeyedGetDataRequests( + List heartbeats) { + return heartbeats.stream() + .map( + request -> + Windmill.KeyedGetDataRequest.newBuilder() + .setShardingKey(request.getShardingKey()) + .setWorkToken(request.getWorkToken()) + .setCacheToken(request.getCacheToken()) + .addAllLatencyAttribution(request.getLatencyAttributionList()) + .build()) + .collect(Collectors.toList()); + } + + public static GetDataClient create( + ApplianceWindmillClient windmillClient, ThrottlingGetDataMetricTracker getDataMetricTracker) { + return new ApplianceGetDataClient(windmillClient, getDataMetricTracker); + } + + @Override + public Windmill.KeyedGetDataResponse getStateData( + String computation, Windmill.KeyedGetDataRequest request) { + try (AutoCloseable ignored = + getDataMetricTracker.trackSingleCallWithThrottling( + ThrottlingGetDataMetricTracker.Type.STATE)) { + SettableFuture response = SettableFuture.create(); + ReadBatch batch = addToReadBatch(new QueueEntry(computation, request, response)); + if (batch != null) { + issueReadBatch(batch); + } + return response.get(); + } catch (Exception e) { + throw new GetDataException( + "Error occurred fetching state for computation=" + + computation + + ", key=" + + 
request.getShardingKey(), + e); + } + } + + @Override + public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { + try (AutoCloseable ignored = + getDataMetricTracker.trackSingleCallWithThrottling( + ThrottlingGetDataMetricTracker.Type.STATE)) { + return windmillClient + .getData(Windmill.GetDataRequest.newBuilder().addGlobalDataFetchRequests(request).build()) + .getGlobalData(0); + } catch (Exception e) { + throw new GetDataException( + "Error occurred fetching side input for tag=" + request.getDataId(), e); + } + } + + /** + * Appliance sends heartbeats (used to refresh active work) as KeyedGetDataRequests. So we must + * translate the HeartbeatRequest to a KeyedGetDataRequest. + */ + @Override + public void refreshActiveWork(Map> heartbeats) { + if (heartbeats.isEmpty()) { + return; + } + + try (AutoCloseable ignored = getDataMetricTracker.trackHeartbeats(heartbeats.size())) { + List requests = + heartbeats.entrySet().stream() + .map( + entry -> + ComputationGetDataRequest.newBuilder() + .setComputationId(entry.getKey()) + .addAllRequests(convertToKeyedGetDataRequests(entry.getValue())) + .build()) + .collect(Collectors.toList()); + windmillClient.getData(Windmill.GetDataRequest.newBuilder().addAllRequests(requests).build()); + } catch (Exception e) { + throw new GetDataException("Error occurred refreshing heartbeats=" + heartbeats, e); + } + } + + @Override + public synchronized void printHtml(PrintWriter writer) { + getDataMetricTracker.printHtml(writer); + writer.println(" Read threads: " + activeReadThreads); + writer.println(" Pending read batches: " + pendingReadBatches.size()); + } + + private void issueReadBatch(ReadBatch batch) { + try { + Preconditions.checkState(batch.startRead.get()); + } catch (InterruptedException e) { + // We don't expect this thread to be interrupted. To simplify handling, we just fall through + // to issuing the call. 
+ assert (false); + Thread.currentThread().interrupt(); + } catch (ExecutionException e) { + // startRead is a SettableFuture so this should never occur. + throw new AssertionError("Should not have exception on startRead", e); + } + Map> pendingResponses = + new HashMap<>(batch.reads.size()); + Map computationBuilders = new HashMap<>(); + for (QueueEntry entry : batch.reads) { + ComputationGetDataRequest.Builder computationBuilder = + computationBuilders.computeIfAbsent( + entry.computation, k -> ComputationGetDataRequest.newBuilder().setComputationId(k)); + + computationBuilder.addRequests(entry.request); + pendingResponses.put( + WindmillComputationKey.create( + entry.computation, entry.request.getKey(), entry.request.getShardingKey()), + entry.response); + } + + // Build the full GetDataRequest from the KeyedGetDataRequests pulled from the queue. + Windmill.GetDataRequest.Builder builder = Windmill.GetDataRequest.newBuilder(); + for (ComputationGetDataRequest.Builder computationBuilder : computationBuilders.values()) { + builder.addRequests(computationBuilder); + } + + try { + Windmill.GetDataResponse response = windmillClient.getData(builder.build()); + // Dispatch the per-key responses back to the waiting threads. + for (Windmill.ComputationGetDataResponse computationResponse : response.getDataList()) { + for (Windmill.KeyedGetDataResponse keyResponse : computationResponse.getDataList()) { + pendingResponses + .get( + WindmillComputationKey.create( + computationResponse.getComputationId(), + keyResponse.getKey(), + keyResponse.getShardingKey())) + .set(keyResponse); + } + } + } catch (RuntimeException e) { + // Fan the exception out to the reads. + for (QueueEntry entry : batch.reads) { + entry.response.setException(e); + } + } finally { + synchronized (this) { + Preconditions.checkState(activeReadThreads >= 1); + if (pendingReadBatches.isEmpty()) { + activeReadThreads--; + } else { + // Notify the thread responsible for issuing the next batch read. 
+ ReadBatch startBatch = pendingReadBatches.remove(0); + startBatch.startRead.set(true); + } + } + } + } + + /** + * Adds the entry to a read batch for sending to the windmill server. If a non-null batch is + * returned, this thread will be responsible for sending the batch and should wait for the batch + * startRead to be notified. If null is returned, the entry was added to a read batch that will be + * issued by another thread. + */ + private @Nullable ReadBatch addToReadBatch(QueueEntry entry) { + synchronized (this) { + ReadBatch batch; + if (activeReadThreads < MAX_ACTIVE_READS) { + assert (pendingReadBatches.isEmpty()); + activeReadThreads += 1; + // fall through to below synchronized block + } else if (pendingReadBatches.isEmpty() + || pendingReadBatches.get(pendingReadBatches.size() - 1).reads.size() + >= MAX_READS_PER_BATCH) { + // This is the first read of a batch, it will be responsible for sending the batch. + batch = new ReadBatch(); + pendingReadBatches.add(batch); + batch.reads.add(entry); + return batch; + } else { + // This fits within an existing batch, it will be sent by the first blocking thread in the + // batch. 
+ pendingReadBatches.get(pendingReadBatches.size() - 1).reads.add(entry); + return null; + } + } + ReadBatch batch = new ReadBatch(); + batch.reads.add(entry); + batch.startRead.set(true); + return batch; + } + + private static final class ReadBatch { + ArrayList reads = new ArrayList<>(); + SettableFuture startRead = SettableFuture.create(); + } + + private static final class QueueEntry { + final String computation; + final Windmill.KeyedGetDataRequest request; + final SettableFuture response; + + QueueEntry( + String computation, + Windmill.KeyedGetDataRequest request, + SettableFuture response) { + this.computation = computation; + this.request = request; + this.response = response; + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/GetDataClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/GetDataClient.java new file mode 100644 index 0000000000000..8e311f8b98558 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/GetDataClient.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
/** Client for streaming backend GetData API. */
@Internal
public interface GetDataClient {
  /**
   * Fetches per-key state for the given computation.
   *
   * @param computation id of the computation the key belongs to.
   * @param request the keyed read to issue.
   * @return the backend's response for this key.
   * @throws GetDataException if the fetch fails.
   */
  KeyedGetDataResponse getStateData(String computation, KeyedGetDataRequest request);

  /**
   * Fetches side input (global) data for the given request.
   *
   * @throws GetDataException if the fetch fails.
   */
  GlobalData getSideInputData(GlobalDataRequest request);

  /**
   * Sends heartbeats to refresh active work, keyed by computation id.
   *
   * @throws GetDataException if the refresh fails.
   */
  void refreshActiveWork(Map<String, List<HeartbeatRequest>> heartbeats);

  /** Writes a human-readable debug summary of this client's state. Default: no output. */
  default void printHtml(PrintWriter writer) {}

  /** Unchecked wrapper for any failure raised by a GetData call; always carries a cause. */
  class GetDataException extends RuntimeException {
    protected GetDataException(String message, Throwable cause) {
      super(message, cause);
    }
  }
}
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.client.getdata; + +import com.google.auto.value.AutoBuilder; +import java.io.PrintWriter; +import java.util.List; +import java.util.Map; +import javax.annotation.concurrent.ThreadSafe; +import org.apache.beam.runners.dataflow.worker.windmill.StreamingEngineWindmillClient; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalDataRequest; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.KeyedGetDataRequest; +import org.apache.beam.runners.dataflow.worker.windmill.client.CloseableStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStreamPool; +import org.apache.beam.sdk.annotations.Internal; +import org.joda.time.Duration; + +/** + * StreamingEngine implementation of {@link GetDataClient}. + * + * @implNote Uses {@link WindmillStreamPool} to send/receive requests. Depending on options, may use + * a dedicated stream pool for heartbeats. 
+ */ +@Internal +@ThreadSafe +public final class StreamingEngineGetDataClient implements GetDataClient { + private static final Duration STREAM_TIMEOUT = Duration.standardSeconds(30); + + private final WindmillStreamPool getDataStreamPool; + private final WindmillStreamPool heartbeatStreamPool; + private final ThrottlingGetDataMetricTracker getDataMetricTracker; + + StreamingEngineGetDataClient( + StreamingEngineWindmillClient windmillClient, + ThrottlingGetDataMetricTracker getDataMetricTracker, + boolean useSeparateHeartbeatStreams, + int numGetDataStreams) { + this.getDataMetricTracker = getDataMetricTracker; + this.getDataStreamPool = + WindmillStreamPool.create( + Math.max(1, numGetDataStreams), STREAM_TIMEOUT, windmillClient::getDataStream); + if (useSeparateHeartbeatStreams) { + this.heartbeatStreamPool = + WindmillStreamPool.create(1, STREAM_TIMEOUT, windmillClient::getDataStream); + } else { + this.heartbeatStreamPool = this.getDataStreamPool; + } + } + + public static Builder builder( + StreamingEngineWindmillClient windmillClient, + ThrottlingGetDataMetricTracker getDataMetricTracker) { + return new AutoBuilder_StreamingEngineGetDataClient_Builder() + .setWindmillClient(windmillClient) + .setGetDataMetricTracker(getDataMetricTracker) + .setUseSeparateHeartbeatStreams(false) + .setNumGetDataStreams(1); + } + + @Override + public Windmill.KeyedGetDataResponse getStateData( + String computation, KeyedGetDataRequest request) { + try (AutoCloseable ignored = + getDataMetricTracker.trackSingleCallWithThrottling( + ThrottlingGetDataMetricTracker.Type.STATE); + CloseableStream closeableStream = getDataStreamPool.getCloseableStream()) { + return closeableStream.stream().requestKeyedData(computation, request); + } catch (Exception e) { + throw new GetDataException( + "Error occurred fetching state for computation=" + + computation + + ", key=" + + request.getShardingKey(), + e); + } + } + + @Override + public Windmill.GlobalData getSideInputData(GlobalDataRequest 
request) { + try (AutoCloseable ignored = + getDataMetricTracker.trackSingleCallWithThrottling( + ThrottlingGetDataMetricTracker.Type.STATE); + CloseableStream closeableStream = getDataStreamPool.getCloseableStream()) { + return closeableStream.stream().requestGlobalData(request); + } catch (Exception e) { + throw new GetDataException( + "Error occurred fetching side input for tag=" + request.getDataId(), e); + } + } + + @Override + public void refreshActiveWork(Map> heartbeats) { + if (heartbeats.isEmpty()) { + return; + } + + try (AutoCloseable ignored = + getDataMetricTracker.trackSingleCallWithThrottling( + ThrottlingGetDataMetricTracker.Type.STATE); + CloseableStream closeableStream = heartbeatStreamPool.getCloseableStream()) { + closeableStream.stream().refreshActiveWork(heartbeats); + } catch (Exception e) { + throw new GetDataException("Error occurred refreshing heartbeats=" + heartbeats, e); + } + } + + @Override + public void printHtml(PrintWriter writer) { + getDataMetricTracker.printHtml(writer); + } + + @Internal + @AutoBuilder + public abstract static class Builder { + abstract Builder setWindmillClient(StreamingEngineWindmillClient windmillClient); + + abstract Builder setGetDataMetricTracker(ThrottlingGetDataMetricTracker getDataMetricTracker); + + public abstract Builder setUseSeparateHeartbeatStreams(boolean useSeparateHeartbeatStreams); + + public abstract Builder setNumGetDataStreams(int numGetDataStreams); + + abstract StreamingEngineGetDataClient autoBuild(); + + public final GetDataClient build() { + return autoBuild(); + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTracker.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTracker.java new file mode 100644 index 0000000000000..bc462df94bf7d --- /dev/null +++ 
/**
 * Wraps GetData calls that tracks metrics for the number of in-flight requests and throttles
 * requests when memory pressure is high.
 */
@Internal
@ThreadSafe
public final class ThrottlingGetDataMetricTracker {
  private final MemoryMonitor gcThrashingMonitor;
  private final GetDataMetrics getDataMetrics;

  public ThrottlingGetDataMetricTracker(MemoryMonitor gcThrashingMonitor) {
    this.gcThrashingMonitor = gcThrashingMonitor;
    this.getDataMetrics = GetDataMetrics.create();
  }

  /**
   * Tracks a GetData call. If there is memory pressure, may throttle requests. Returns an {@link
   * AutoCloseable} that will decrement the metric after the call is finished.
   */
  public AutoCloseable trackSingleCallWithThrottling(Type callType) {
    // Blocks until the memory monitor reports resources are available; this is the throttle.
    gcThrashingMonitor.waitForResources(callType.debugName);
    // Counter corresponding to callType (state reads, side inputs, or heartbeats).
    AtomicInteger getDataMetricTracker = getDataMetrics.getMetricFor(callType);
    getDataMetricTracker.getAndIncrement();
    // Caller closes this to decrement; no clamping — increments and decrements are paired.
    return getDataMetricTracker::getAndDecrement;
  }

  /**
   * Tracks heartbeat request metrics. Returns an {@link AutoCloseable} that will decrement the
   * metric after the call is finished.
   */
  public AutoCloseable trackHeartbeats(int numHeartbeats) {
    // NOTE: unlike trackSingleCallWithThrottling, this does not consult the memory monitor.
    getDataMetrics
        .activeHeartbeats()
        .getAndUpdate(currentActiveHeartbeats -> currentActiveHeartbeats + numHeartbeats);
    // Active heartbeats should never drop below 0.
    return () ->
        getDataMetrics
            .activeHeartbeats()
            .getAndUpdate(existing -> Math.max(existing - numHeartbeats, 0));
  }

  /** Writes the current in-flight counters for status pages. */
  public void printHtml(PrintWriter writer) {
    writer.println("Active Fetches:");
    getDataMetrics.printMetrics(writer);
  }

  /** Point-in-time copy of the counters; used by tests to observe metric transitions. */
  @VisibleForTesting
  GetDataMetrics.ReadOnlySnapshot getMetricsSnapshot() {
    return getDataMetrics.snapshot();
  }

  /** Kinds of tracked GetData calls; debugName is passed to the memory monitor and status pages. */
  public enum Type {
    STATE("GetStateData"),
    SIDE_INPUT("GetSideInputData"),
    HEARTBEAT("RefreshActiveWork");
    private final String debugName;

    Type(String debugName) {
      this.debugName = debugName;
    }

    public final String debugName() {
      return debugName;
    }
  }

  /** Mutable in-flight counters, one AtomicInteger per {@link Type}. */
  @AutoValue
  abstract static class GetDataMetrics {
    private static GetDataMetrics create() {
      // AutoValue-generated constructor; argument order matches the accessor declaration order.
      return new AutoValue_ThrottlingGetDataMetricTracker_GetDataMetrics(
          new AtomicInteger(), new AtomicInteger(), new AtomicInteger());
    }

    abstract AtomicInteger activeSideInputs();

    abstract AtomicInteger activeStateReads();

    abstract AtomicInteger activeHeartbeats();

    private ReadOnlySnapshot snapshot() {
      return ReadOnlySnapshot.create(
          activeSideInputs().get(), activeStateReads().get(), activeHeartbeats().get());
    }

    // Maps a call type to its counter; the switch is exhaustive over Type.
    private AtomicInteger getMetricFor(Type callType) {
      switch (callType) {
        case STATE:
          return activeStateReads();
        case SIDE_INPUT:
          return activeSideInputs();
        case HEARTBEAT:
          return activeHeartbeats();

        default:
          // Should never happen, switch is exhaustive.
          throw new IllegalStateException("Unsupported CallType=" + callType);
      }
    }

    private void printMetrics(PrintWriter writer) {
      writer.println("  Side Inputs: " + activeSideInputs().get());
      writer.println("  State Reads: " + activeStateReads().get());
      writer.println("Heartbeat Keys Active: " + activeHeartbeats().get());
    }

    /** Immutable copy of the counters at a single instant. */
    @AutoValue
    abstract static class ReadOnlySnapshot {

      private static ReadOnlySnapshot create(
          int activeSideInputs, int activeStateReads, int activeHeartbeats) {
        return new AutoValue_ThrottlingGetDataMetricTracker_GetDataMetrics_ReadOnlySnapshot(
            activeSideInputs, activeStateReads, activeHeartbeats);
      }

      abstract int activeSideInputs();

      abstract int activeStateReads();

      abstract int activeHeartbeats();
    }
  }
}
synchronized void initializeLocalHost(int port) { this.maxBackoff = Duration.millis(500); if (options.isEnableStreamingEngine()) { diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java index 127d46b7caf61..7514c526bbd16 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java @@ -91,7 +91,6 @@ public final class FakeWindmillServer extends WindmillServerStub { private final ConcurrentHashMap> droppedStreamingCommits; private int commitsRequested = 0; private final List getDataRequests = new ArrayList<>(); - private boolean isReady = true; private boolean dropStreamingCommits = false; private final Consumer> processHeartbeatResponses; @@ -523,27 +522,13 @@ public ArrayList getStatsReceived() { } @Override - public void setWindmillServiceEndpoints(Set endpoints) { - synchronized (this) { - this.dispatcherEndpoints = ImmutableSet.copyOf(endpoints); - isReady = true; - } + public synchronized void setWindmillServiceEndpoints(Set endpoints) { + this.dispatcherEndpoints = ImmutableSet.copyOf(endpoints); } @Override - public ImmutableSet getWindmillServiceEndpoints() { - synchronized (this) { - return dispatcherEndpoints; - } - } - - @Override - public boolean isReady() { - return isReady; - } - - public void setIsReady(boolean ready) { - this.isReady = ready; + public synchronized ImmutableSet getWindmillServiceEndpoints() { + return dispatcherEndpoints; } public static class ResponseQueue { diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java 
b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java index 52bc61e59919d..17430df611ce7 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java @@ -888,7 +888,6 @@ private void runTestBasic(int numCommitThreads) throws Exception { makeSourceInstruction(StringUtf8Coder.of()), makeSinkInstruction(StringUtf8Coder.of(), 0)); - server.setIsReady(false); StreamingConfigTask streamingConfig = new StreamingConfigTask(); streamingConfig.setStreamingComputationConfigs( ImmutableList.of(makeDefaultStreamingComputationConfig(instructions))); @@ -936,8 +935,6 @@ public void testHotKeyLogging() throws Exception { makeSourceInstruction(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of())), makeSinkInstruction(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()), 0)); - server.setIsReady(false); - StreamingConfigTask streamingConfig = new StreamingConfigTask(); streamingConfig.setStreamingComputationConfigs( ImmutableList.of(makeDefaultStreamingComputationConfig(instructions))); @@ -975,8 +972,6 @@ public void testHotKeyLoggingNotEnabled() throws Exception { makeSourceInstruction(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of())), makeSinkInstruction(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()), 0)); - server.setIsReady(false); - StreamingConfigTask streamingConfig = new StreamingConfigTask(); streamingConfig.setStreamingComputationConfigs( ImmutableList.of(makeDefaultStreamingComputationConfig(instructions))); diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcherTest.java 
b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcherTest.java index ad2ac6baeabbc..c27815500ed26 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcherTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcherTest.java @@ -33,8 +33,8 @@ import java.util.List; import java.util.concurrent.TimeUnit; import org.apache.beam.runners.dataflow.options.DataflowStreamingPipelineOptions; -import org.apache.beam.runners.dataflow.worker.MetricTrackingWindmillServerStub; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.ListCoder; import org.apache.beam.sdk.coders.StringUtf8Coder; @@ -70,7 +70,7 @@ public class SideInputStateFetcherTest { @Rule public transient Timeout globalTimeout = Timeout.seconds(600); private static final String STATE_FAMILY = "state"; - @Mock private MetricTrackingWindmillServerStub server; + @Mock private GetDataClient server; @Mock private Supplier readStateSupplier; diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTrackerTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTrackerTest.java new file mode 100644 index 0000000000000..b19e7f06896cb --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTrackerTest.java @@ -0,0 +1,374 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under 
one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.client.getdata; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertFalse; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.doNothing; +import static org.mockito.Mockito.mock; + +import java.util.List; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import org.apache.beam.runners.dataflow.worker.util.MemoryMonitor; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.ThrottlingGetDataMetricTracker.Type; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +@SuppressWarnings("FutureReturnValueIgnored") +public class ThrottlingGetDataMetricTrackerTest { + + private final MemoryMonitor memoryMonitor = mock(MemoryMonitor.class); + private final ThrottlingGetDataMetricTracker getDataMetricTracker = + new 
ThrottlingGetDataMetricTracker(memoryMonitor); + private final ExecutorService getDataProcessor = Executors.newCachedThreadPool(); + + @Test + public void testTrackSingleCallWithThrottling_STATE() throws InterruptedException { + doNothing().when(memoryMonitor).waitForResources(eq(Type.STATE.debugName())); + CountDownLatch processCall = new CountDownLatch(1); + CountDownLatch callProcessing = new CountDownLatch(1); + CountDownLatch processingDone = new CountDownLatch(1); + getDataProcessor.submit( + () -> { + try (AutoCloseable ignored = + getDataMetricTracker.trackSingleCallWithThrottling(Type.STATE)) { + callProcessing.countDown(); + processCall.await(); + } catch (Exception e) { + // Do nothing. + } + processingDone.countDown(); + }); + + callProcessing.await(); + ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsWhileProcessing = + getDataMetricTracker.getMetricsSnapshot(); + + assertThat(metricsWhileProcessing.activeStateReads()).isEqualTo(1); + assertThat(metricsWhileProcessing.activeHeartbeats()).isEqualTo(0); + assertThat(metricsWhileProcessing.activeSideInputs()).isEqualTo(0); + + // Free the thread inside the AutoCloseable, wait for processingDone and check that metrics gets + // decremented + processCall.countDown(); + processingDone.await(); + ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsAfterProcessing = + getDataMetricTracker.getMetricsSnapshot(); + assertThat(metricsAfterProcessing.activeStateReads()).isEqualTo(0); + assertThat(metricsAfterProcessing.activeHeartbeats()).isEqualTo(0); + assertThat(metricsAfterProcessing.activeSideInputs()).isEqualTo(0); + } + + @Test + public void testTrackSingleCallWithThrottling_SIDE_INPUT() throws InterruptedException { + doNothing().when(memoryMonitor).waitForResources(eq(Type.SIDE_INPUT.debugName())); + CountDownLatch processCall = new CountDownLatch(1); + CountDownLatch callProcessing = new CountDownLatch(1); + CountDownLatch processingDone = new CountDownLatch(1); + 
getDataProcessor.submit( + () -> { + try (AutoCloseable ignored = + getDataMetricTracker.trackSingleCallWithThrottling(Type.SIDE_INPUT)) { + callProcessing.countDown(); + processCall.await(); + } catch (Exception e) { + // Do nothing. + } + processingDone.countDown(); + }); + + callProcessing.await(); + ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsWhileProcessing = + getDataMetricTracker.getMetricsSnapshot(); + + assertThat(metricsWhileProcessing.activeStateReads()).isEqualTo(0); + assertThat(metricsWhileProcessing.activeHeartbeats()).isEqualTo(0); + assertThat(metricsWhileProcessing.activeSideInputs()).isEqualTo(1); + + // Free the thread inside the AutoCloseable, wait for processingDone and check that metrics gets + // decremented + processCall.countDown(); + processingDone.await(); + ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsAfterProcessing = + getDataMetricTracker.getMetricsSnapshot(); + assertThat(metricsAfterProcessing.activeStateReads()).isEqualTo(0); + assertThat(metricsAfterProcessing.activeHeartbeats()).isEqualTo(0); + assertThat(metricsAfterProcessing.activeSideInputs()).isEqualTo(0); + } + + @Test + public void testTrackSingleCallWithThrottling_HEARTBEAT() throws InterruptedException { + doNothing().when(memoryMonitor).waitForResources(eq(Type.HEARTBEAT.debugName())); + CountDownLatch processCall = new CountDownLatch(1); + CountDownLatch callProcessing = new CountDownLatch(1); + CountDownLatch processingDone = new CountDownLatch(1); + getDataProcessor.submit( + () -> { + try (AutoCloseable ignored = + getDataMetricTracker.trackSingleCallWithThrottling(Type.HEARTBEAT)) { + callProcessing.countDown(); + processCall.await(); + } catch (Exception e) { + // Do nothing. 
+ } + processingDone.countDown(); + }); + + callProcessing.await(); + ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsWhileProcessing = + getDataMetricTracker.getMetricsSnapshot(); + + assertThat(metricsWhileProcessing.activeStateReads()).isEqualTo(0); + assertThat(metricsWhileProcessing.activeHeartbeats()).isEqualTo(1); + assertThat(metricsWhileProcessing.activeSideInputs()).isEqualTo(0); + + // Free the thread inside the AutoCloseable, wait for processingDone and check that metrics gets + // decremented + processCall.countDown(); + processingDone.await(); + ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsAfterProcessing = + getDataMetricTracker.getMetricsSnapshot(); + assertThat(metricsAfterProcessing.activeStateReads()).isEqualTo(0); + assertThat(metricsAfterProcessing.activeHeartbeats()).isEqualTo(0); + assertThat(metricsAfterProcessing.activeSideInputs()).isEqualTo(0); + } + + @Test + public void testTrackSingleCall_multipleThreads() throws InterruptedException { + doNothing().when(memoryMonitor).waitForResources(anyString()); + // Issuing 5 calls (1 from each thread) + // 2 State Reads + // 2 SideInput Reads + // 1 Heartbeat + List callTypes = + Lists.newArrayList( + Type.STATE, Type.SIDE_INPUT, Type.STATE, Type.HEARTBEAT, Type.SIDE_INPUT); + CountDownLatch processCall = new CountDownLatch(callTypes.size()); + CountDownLatch callProcessing = new CountDownLatch(callTypes.size()); + CountDownLatch processingDone = new CountDownLatch(callTypes.size()); + for (Type callType : callTypes) { + getDataProcessor.submit( + () -> { + try (AutoCloseable ignored = + getDataMetricTracker.trackSingleCallWithThrottling(callType)) { + callProcessing.countDown(); + processCall.await(); + } catch (Exception e) { + // Do nothing. 
+ } + processingDone.countDown(); + }); + } + + callProcessing.await(); + ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsWhileProcessing = + getDataMetricTracker.getMetricsSnapshot(); + + // Asserting that metrics reflects: + // 2 State Reads + // 2 SideInput Reads + // 1 Heartbeat + assertThat(metricsWhileProcessing.activeStateReads()).isEqualTo(2); + assertThat(metricsWhileProcessing.activeSideInputs()).isEqualTo(2); + assertThat(metricsWhileProcessing.activeHeartbeats()).isEqualTo(1); + + // Free the thread inside the AutoCloseable, wait for processingDone and check that metrics gets + // decremented + for (int i = 0; i < callTypes.size(); i++) { + processCall.countDown(); + } + processingDone.await(); + ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsAfterProcessing = + getDataMetricTracker.getMetricsSnapshot(); + assertThat(metricsAfterProcessing.activeStateReads()).isEqualTo(0); + assertThat(metricsAfterProcessing.activeHeartbeats()).isEqualTo(0); + assertThat(metricsAfterProcessing.activeSideInputs()).isEqualTo(0); + } + + @Test + public void testThrottledTrackSingleCallWithThrottling() throws InterruptedException { + CountDownLatch mockThrottler = simulateMemoryPressure(); + CountDownLatch processCall = new CountDownLatch(1); + CountDownLatch callProcessing = new CountDownLatch(1); + CountDownLatch processingDone = new CountDownLatch(1); + getDataProcessor.submit( + () -> { + try (AutoCloseable ignored = + getDataMetricTracker.trackSingleCallWithThrottling(Type.STATE)) { + callProcessing.countDown(); + processCall.await(); + } catch (Exception e) { + // Do nothing. 
+ } + processingDone.countDown(); + }); + + assertFalse(callProcessing.await(10, TimeUnit.MILLISECONDS)); + ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsBeforeProcessing = + getDataMetricTracker.getMetricsSnapshot(); + assertThat(metricsBeforeProcessing.activeStateReads()).isEqualTo(0); + assertThat(metricsBeforeProcessing.activeHeartbeats()).isEqualTo(0); + assertThat(metricsBeforeProcessing.activeSideInputs()).isEqualTo(0); + + // Stop throttling. + mockThrottler.countDown(); + callProcessing.await(); + ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsWhileProcessing = + getDataMetricTracker.getMetricsSnapshot(); + + assertThat(metricsWhileProcessing.activeStateReads()).isEqualTo(1); + + // Free the thread inside the AutoCloseable, wait for processingDone and check that metrics gets + // decremented + processCall.countDown(); + processingDone.await(); + ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsAfterProcessing = + getDataMetricTracker.getMetricsSnapshot(); + assertThat(metricsAfterProcessing.activeStateReads()).isEqualTo(0); + } + + @Test + public void testTrackSingleCall_exceptionThrown() throws InterruptedException { + doNothing().when(memoryMonitor).waitForResources(anyString()); + CountDownLatch callProcessing = new CountDownLatch(1); + CountDownLatch beforeException = new CountDownLatch(1); + CountDownLatch afterException = new CountDownLatch(1); + + // Catch the exception outside the try-with-resources block to ensure that + // AutoCloseable.closed() runs in the midst of an exception. 
+ getDataProcessor.submit( + () -> { + try { + try (AutoCloseable ignored = + getDataMetricTracker.trackSingleCallWithThrottling(Type.STATE)) { + callProcessing.countDown(); + beforeException.await(); + throw new RuntimeException("something bad happened"); + } + } catch (RuntimeException e) { + afterException.countDown(); + throw e; + } + }); + + callProcessing.await(); + + ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsWhileProcessing = + getDataMetricTracker.getMetricsSnapshot(); + + assertThat(metricsWhileProcessing.activeStateReads()).isEqualTo(1); + beforeException.countDown(); + + // In the midst of an exception, close() should still run. + afterException.await(); + ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsAfterProcessing = + getDataMetricTracker.getMetricsSnapshot(); + assertThat(metricsAfterProcessing.activeStateReads()).isEqualTo(0); + } + + @Test + public void testTrackHeartbeats() throws InterruptedException { + CountDownLatch processCall = new CountDownLatch(1); + CountDownLatch callProcessing = new CountDownLatch(1); + CountDownLatch processingDone = new CountDownLatch(1); + int numHeartbeats = 5; + getDataProcessor.submit( + () -> { + try (AutoCloseable ignored = getDataMetricTracker.trackHeartbeats(numHeartbeats)) { + callProcessing.countDown(); + processCall.await(); + } catch (Exception e) { + // Do nothing. 
+ } + processingDone.countDown(); + }); + + callProcessing.await(); + ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsWhileProcessing = + getDataMetricTracker.getMetricsSnapshot(); + + assertThat(metricsWhileProcessing.activeHeartbeats()).isEqualTo(5); + + // Free the thread inside the AutoCloseable, wait for processingDone and check that metrics gets + // decremented + processCall.countDown(); + processingDone.await(); + ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsAfterProcessing = + getDataMetricTracker.getMetricsSnapshot(); + assertThat(metricsAfterProcessing.activeHeartbeats()).isEqualTo(0); + } + + @Test + public void testTrackHeartbeats_exceptionThrown() throws InterruptedException { + CountDownLatch callProcessing = new CountDownLatch(1); + CountDownLatch beforeException = new CountDownLatch(1); + CountDownLatch afterException = new CountDownLatch(1); + int numHeartbeats = 10; + // Catch the exception outside the try-with-resources block to ensure that + // AutoCloseable.closed() runs in the midst of an exception. + getDataProcessor.submit( + () -> { + try { + try (AutoCloseable ignored = getDataMetricTracker.trackHeartbeats(numHeartbeats)) { + callProcessing.countDown(); + beforeException.await(); + throw new RuntimeException("something bad happened"); + } + } catch (RuntimeException e) { + afterException.countDown(); + throw e; + } + }); + + callProcessing.await(); + + ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsWhileProcessing = + getDataMetricTracker.getMetricsSnapshot(); + + assertThat(metricsWhileProcessing.activeHeartbeats()).isEqualTo(numHeartbeats); + beforeException.countDown(); + + // In the midst of an exception, close() should still run. 
+ afterException.await(); + ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsAfterProcessing = + getDataMetricTracker.getMetricsSnapshot(); + assertThat(metricsAfterProcessing.activeHeartbeats()).isEqualTo(0); + } + + /** Have the memory monitor block when waitForResources is called simulating memory pressure. */ + private CountDownLatch simulateMemoryPressure() { + CountDownLatch mockThrottler = new CountDownLatch(1); + doAnswer( + invocationOnMock -> { + mockThrottler.await(); + return null; + }) + .when(memoryMonitor) + .waitForResources(anyString()); + return mockThrottler; + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateReaderTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateReaderTest.java index 3460fc4cab922..8dbfc35192b7d 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateReaderTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateReaderTest.java @@ -35,13 +35,13 @@ import java.util.Optional; import java.util.concurrent.Future; import org.apache.beam.runners.dataflow.worker.KeyTokenInvalidException; -import org.apache.beam.runners.dataflow.worker.MetricTrackingWindmillServerStub; import org.apache.beam.runners.dataflow.worker.WindmillStateTestUtils; import org.apache.beam.runners.dataflow.worker.WindmillTimeUtils; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.KeyedGetDataRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.SortedListEntry; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.SortedListRange; +import 
org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.VarIntCoder; import org.apache.beam.sdk.transforms.windowing.BoundedWindow; @@ -97,7 +97,7 @@ private static void assertNoReader(Object obj) throws Exception { WindmillStateTestUtils.assertNoReference(obj, WindmillStateReader.class); } - @Mock private MetricTrackingWindmillServerStub mockWindmill; + @Mock private GetDataClient mockWindmill; private WindmillStateReader underTest; From 4ba59e39d20c47e54d27fee53b0099b02a767f55 Mon Sep 17 00:00:00 2001 From: Martin Trieu Date: Fri, 5 Jul 2024 02:11:08 -0700 Subject: [PATCH 2/7] add HeartbeatSender interface --- .../worker/StreamingDataflowWorker.java | 58 +++-- .../worker/streaming/ActiveWorkState.java | 17 ++ .../worker/streaming/ComputationState.java | 10 +- .../worker/streaming/ExecutableWork.java | 2 +- .../worker/streaming/RefreshableWork.java | 41 ++++ .../dataflow/worker/streaming/Work.java | 192 ++++++++++++++-- .../dataflow/worker/streaming/WorkId.java | 4 +- .../worker/windmill/WindmillConnection.java | 6 +- .../client/AbstractWindmillStream.java | 42 +++- .../windmill/client/WindmillStream.java | 56 ++++- .../commits/StreamingEngineWorkCommitter.java | 23 +- .../getdata/ApplianceGetDataClient.java | 46 ++-- .../client/getdata/DirectGetDataClient.java | 92 ++++++++ .../getdata/FanOutWorkRefreshClient.java | 80 +++++++ .../client/getdata/GetDataClient.java | 9 +- .../getdata/StreamingEngineGetDataClient.java | 73 ++----- .../client/getdata/WorkRefreshClient.java | 27 +++ .../client/grpc/GrpcCommitWorkStream.java | 6 +- .../client/grpc/GrpcDirectGetWorkStream.java | 16 +- .../client/grpc/GrpcGetDataStream.java | 10 +- .../client/grpc/GrpcGetWorkStream.java | 10 +- .../grpc/GrpcGetWorkerMetadataStream.java | 3 +- .../grpc/GrpcWindmillStreamFactory.java | 10 +- .../client/grpc/StreamingEngineClient.java | 2 +- .../client/grpc/WindmillStreamSender.java | 14 +- 
.../StreamObserverCancelledException.java | 31 +++ .../work/refresh/ActiveWorkRefresher.java | 59 ++++- .../work/refresh/ActiveWorkRefreshers.java | 50 ----- .../refresh/ApplianceHeartbeatSender.java | 62 ++++++ .../work/refresh/DirectHeartbeatSender.java | 72 ++++++ .../DispatchedActiveWorkRefresher.java | 68 ------ .../windmill/work/refresh/Heartbeat.java | 40 ++++ .../work/refresh/HeartbeatRequests.java | 84 +++++++ .../work/refresh/HeartbeatSender.java | 28 +++ .../refresh/StreamPoolHeartbeatSender.java | 48 ++++ .../dataflow/worker/FakeWindmillServer.java | 71 ++++-- .../worker/StreamingDataflowWorkerTest.java | 9 +- .../StreamingModeExecutionContextTest.java | 10 +- .../worker/WorkerCustomSourcesTest.java | 7 +- .../worker/streaming/ActiveWorkStateTest.java | 4 +- .../streaming/ComputationStateCacheTest.java | 4 +- .../worker/util/BoundedQueueExecutorTest.java | 5 +- .../client/WindmillStreamPoolTest.java | 22 +- .../StreamingApplianceWorkCommitterTest.java | 10 +- .../StreamingEngineWorkCommitterTest.java | 23 +- .../client/grpc/WindmillStreamSenderTest.java | 39 ++-- .../EvenGetWorkBudgetDistributorTest.java | 3 +- .../failures/WorkFailureProcessorTest.java | 5 +- ...Test.java => ActiveWorkRefresherTest.java} | 74 ++++--- .../work/refresh/HeartbeatRequestsTest.java | 206 ++++++++++++++++++ 50 files changed, 1493 insertions(+), 390 deletions(-) create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/RefreshableWork.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/DirectGetDataClient.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FanOutWorkRefreshClient.java create mode 100644 
runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/WorkRefreshClient.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/observers/StreamObserverCancelledException.java delete mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefreshers.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ApplianceHeartbeatSender.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/DirectHeartbeatSender.java delete mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/DispatchedActiveWorkRefresher.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeat.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatRequests.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatSender.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/StreamPoolHeartbeatSender.java rename runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/{DispatchedActiveWorkRefresherTest.java => ActiveWorkRefresherTest.java} (80%) create mode 100644 runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatRequestsTest.java 
diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java index 829396cd6dd78..9ae2248afa1d5 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java @@ -66,6 +66,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.WindmillServerStub; import org.apache.beam.runners.dataflow.worker.windmill.WindmillServiceAddress; import org.apache.beam.runners.dataflow.worker.windmill.appliance.JniWindmillApplianceServer; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStreamPool; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.CompleteCommit; @@ -76,6 +77,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.StreamingEngineGetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.ThrottlingGetDataMetricTracker; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.WorkRefreshClient; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.ChannelzServlet; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcDispatcherClient; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcWindmillServer; @@ -91,7 +93,9 @@ import org.apache.beam.runners.dataflow.worker.windmill.work.processing.failures.StreamingEngineFailureTracker; import 
org.apache.beam.runners.dataflow.worker.windmill.work.processing.failures.WorkFailureProcessor; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.ActiveWorkRefresher; -import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.ActiveWorkRefreshers; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.ApplianceHeartbeatSender; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.StreamPoolHeartbeatSender; import org.apache.beam.sdk.fn.IdGenerator; import org.apache.beam.sdk.fn.IdGenerators; import org.apache.beam.sdk.fn.JvmInitializers; @@ -139,6 +143,7 @@ public class StreamingDataflowWorker { static final int GET_WORK_STREAM_TIMEOUT_MINUTES = 3; static final Duration COMMIT_STREAM_TIMEOUT = Duration.standardMinutes(1); private static final Logger LOG = LoggerFactory.getLogger(StreamingDataflowWorker.class); + private static final Duration GET_DATA_STREAM_TIMEOUT = Duration.standardSeconds(30); /** The idGenerator to generate unique id globally. 
*/ private static final IdGenerator ID_GENERATOR = IdGenerators.decrementingLongs(); @@ -163,6 +168,7 @@ public class StreamingDataflowWorker { private final DataflowWorkerHarnessOptions options; private final long clientId; private final GetDataClient getDataClient; + private final WorkRefreshClient workRefreshClient; private final MemoryMonitor memoryMonitor; private final Thread memoryMonitorThread; private final ReaderCache readerCache; @@ -172,6 +178,7 @@ public class StreamingDataflowWorker { private final StreamingWorkerStatusReporter workerStatusReporter; private final StreamingCounters streamingCounters; private final StreamingWorkScheduler streamingWorkScheduler; + private final HeartbeatSender heartbeatSender; private StreamingDataflowWorker( WindmillServerStub windmillServer, @@ -245,13 +252,24 @@ private StreamingDataflowWorker( ThrottlingGetDataMetricTracker getDataMetricTracker = new ThrottlingGetDataMetricTracker(memoryMonitor); - this.getDataClient = - windmillServiceEnabled - ? 
StreamingEngineGetDataClient.builder(windmillServer, getDataMetricTracker) - .setUseSeparateHeartbeatStreams(options.getUseSeparateWindmillHeartbeatStreams()) - .setNumGetDataStreams(options.getWindmillGetDataStreamCount()) - .build() - : ApplianceGetDataClient.create(windmillServer, getDataMetricTracker); + + WindmillStreamPool getDataStreamPool = + WindmillStreamPool.create( + Math.max(1, options.getWindmillGetDataStreamCount()), + GET_DATA_STREAM_TIMEOUT, + windmillServer::getDataStream); + + if (windmillServiceEnabled) { + StreamingEngineGetDataClient streamingEngineGetDataClient = + new StreamingEngineGetDataClient(getDataMetricTracker, getDataStreamPool); + this.getDataClient = streamingEngineGetDataClient; + this.workRefreshClient = streamingEngineGetDataClient; + } else { + ApplianceGetDataClient applianceGetDataClient = + new ApplianceGetDataClient(windmillServer, getDataMetricTracker); + this.getDataClient = applianceGetDataClient; + this.workRefreshClient = applianceGetDataClient; + } // Register standard file systems. FileSystems.setDefaultPipelineOptions(options); @@ -260,15 +278,16 @@ private StreamingDataflowWorker( windmillServiceEnabled && options.getStuckCommitDurationMillis() > 0 ? options.getStuckCommitDurationMillis() : 0; + this.activeWorkRefresher = - ActiveWorkRefreshers.createDispatchedActiveWorkRefresher( + new ActiveWorkRefresher( clock, options.getActiveWorkRefreshPeriodMillis(), stuckCommitDurationMillis, computationStateCache::getAllPresentComputations, sampler, - getDataClient::refreshActiveWork, - executorSupplier.apply("RefreshWork")); + executorSupplier.apply("RefreshWork"), + workRefreshClient::refreshActiveWork); WorkerStatusPages workerStatusPages = WorkerStatusPages.create(DEFAULT_STATUS_PORT, memoryMonitor); @@ -316,6 +335,15 @@ private StreamingDataflowWorker( ID_GENERATOR, stageInfoMap); + this.heartbeatSender = + options.isEnableStreamingEngine() + ? 
new StreamPoolHeartbeatSender( + options.getUseSeparateWindmillHeartbeatStreams() + ? WindmillStreamPool.create( + 1, GET_DATA_STREAM_TIMEOUT, windmillServer::getDataStream) + : getDataStreamPool) + : new ApplianceHeartbeatSender(windmillServer::getData); + LOG.debug("windmillServiceEnabled: {}", windmillServiceEnabled); LOG.debug("WindmillServiceEndpoint: {}", options.getWindmillServiceEndpoint()); LOG.debug("WindmillServicePort: {}", options.getWindmillServicePort()); @@ -837,7 +865,10 @@ private void dispatchLoop() { workItem, watermarks.setOutputDataWatermark(workItem.getOutputDataWatermark()).build(), Work.createProcessingContext( - computationId, getDataClient::getStateData, workCommitter::commit), + computationId, + getDataClient::getStateData, + workCommitter::commit, + heartbeatSender), /* getWorkStreamLatencies= */ Collections.emptyList()); } } @@ -874,7 +905,8 @@ void streamingDispatchLoop() { Work.createProcessingContext( computationState.getComputationId(), getDataClient::getStateData, - workCommitter::commit), + workCommitter::commit, + heartbeatSender), getWorkStreamLatencies); })); try { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java index 3e226514d57ec..5b7e04269f440 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java @@ -18,6 +18,7 @@ package org.apache.beam.runners.dataflow.worker.streaming; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList.toImmutableList; +import static 
org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableListMultimap.flatteningToImmutableListMultimap; import java.io.PrintWriter; import java.util.ArrayDeque; @@ -45,6 +46,7 @@ import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableListMultimap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Multimap; import org.joda.time.Duration; @@ -219,6 +221,21 @@ synchronized void failWorkForKey(Multimap failedWork) { } } + /** + * Returns a read only view of current active work. + * + * @implNote Do not return a reference to the underlying workQueue as iterations over it will + * cause a {@link java.util.ConcurrentModificationException} as it is not a thread-safe data + * structure. 
+ */ + synchronized ImmutableListMultimap getReadOnlyActiveWork() { + return activeWork.entrySet().stream() + .collect( + flatteningToImmutableListMultimap( + Entry::getKey, + e -> e.getValue().stream().map(ExecutableWork::work).map(Work::refreshableView))); + } + private void incrementActiveWorkBudget(Work work) { activeGetWorkBudget.updateAndGet( getWorkBudget -> getWorkBudget.apply(1, work.getWorkItem().getSerializedSize())); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationState.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationState.java index 434e784847997..789dac7dfcf1e 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationState.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationState.java @@ -23,14 +23,12 @@ import java.util.Optional; import java.util.concurrent.ConcurrentLinkedQueue; import javax.annotation.Nullable; -import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableListMultimap; import 
org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Multimap; import org.joda.time.Instant; @@ -147,10 +145,8 @@ private void forceExecute(ExecutableWork executableWork) { executor.forceExecute(executableWork, executableWork.work().getWorkItem().getSerializedSize()); } - /** Gets HeartbeatRequests for any work started before refreshDeadline. */ - public ImmutableList getKeyHeartbeats( - Instant refreshDeadline, DataflowExecutionStateSampler sampler) { - return activeWorkState.getKeyHeartbeats(refreshDeadline, sampler); + public ImmutableListMultimap currentActiveWorkReadOnly() { + return activeWorkState.getReadOnlyActiveWork(); } public GetWorkBudget getActiveWorkBudget() { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ExecutableWork.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ExecutableWork.java index bdf8a7814ea3f..db279f0666301 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ExecutableWork.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ExecutableWork.java @@ -31,7 +31,7 @@ public static ExecutableWork create(Work work, Consumer executeWorkFn) { public abstract Work work(); - abstract Consumer executeWorkFn(); + public abstract Consumer executeWorkFn(); @Override public void run() { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/RefreshableWork.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/RefreshableWork.java new file mode 100644 index 0000000000000..ebbf1911ef4a4 --- /dev/null +++ 
b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/RefreshableWork.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.streaming; + +import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; +import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; +import org.joda.time.Instant; + +/** View of {@link Work} that exposes an interface for work refreshing. 
*/ +@Internal +public interface RefreshableWork { + + WorkId id(); + + boolean isRefreshable(Instant refreshDeadline); + + HeartbeatSender heartbeatSender(); + + ImmutableList getLatencyAttributions( + boolean isHeartbeat, DataflowExecutionStateSampler sampler); + + void setFailed(); +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java index ed3f2671b40c0..1a0012f040eff 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java @@ -20,6 +20,7 @@ import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList.toImmutableList; import com.google.auto.value.AutoValue; +import com.google.common.base.Objects; import java.util.Arrays; import java.util.Collection; import java.util.EnumMap; @@ -31,6 +32,7 @@ import java.util.function.Consumer; import java.util.function.Function; import java.util.function.Supplier; +import javax.annotation.Nullable; import javax.annotation.concurrent.NotThreadSafe; import org.apache.beam.repackaged.core.org.apache.commons.lang3.tuple.Pair; import org.apache.beam.runners.dataflow.worker.ActiveMessageMetadata; @@ -46,10 +48,13 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.commits.Commit; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateReader; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.joda.time.Duration; import 
org.joda.time.Instant; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Represents the state of an attempt to process a {@link WorkItem} by executing user code. @@ -58,7 +63,9 @@ */ @NotThreadSafe @Internal -public final class Work { +public final class Work implements RefreshableWork { + private static final Logger LOG = LoggerFactory.getLogger(Work.class); + private final ShardedKey shardedKey; private final WorkItem workItem; private final ProcessingContext processingContext; @@ -68,28 +75,35 @@ public final class Work { private final Map totalDurationPerState; private final WorkId id; private final String latencyTrackingId; + private final Runnable onFailed; private TimedState currentState; private volatile boolean isFailed; private Work( + ShardedKey shardedKey, WorkItem workItem, - Watermarks watermarks, ProcessingContext processingContext, - Supplier clock) { - this.shardedKey = ShardedKey.create(workItem.getKey(), workItem.getShardingKey()); + Watermarks watermarks, + Supplier clock, + Instant startTime, + Map totalDurationPerState, + WorkId id, + String latencyTrackingId, + Runnable onFailed, + TimedState currentState, + boolean isFailed) { + this.shardedKey = shardedKey; this.workItem = workItem; - this.processingContext = processingContext; this.watermarks = watermarks; this.clock = clock; - this.startTime = clock.get(); - this.totalDurationPerState = new EnumMap<>(LatencyAttribution.State.class); - this.id = WorkId.of(workItem); - this.latencyTrackingId = - Long.toHexString(workItem.getShardingKey()) - + '-' - + Long.toHexString(workItem.getWorkToken()); - this.currentState = TimedState.initialState(startTime); - this.isFailed = false; + this.startTime = startTime; + this.totalDurationPerState = totalDurationPerState; + this.id = id; + this.latencyTrackingId = latencyTrackingId; + this.onFailed = onFailed; + this.currentState = currentState; + this.isFailed = isFailed; + this.processingContext = processingContext; } public static Work 
create( @@ -98,7 +112,21 @@ public static Work create( ProcessingContext processingContext, Supplier clock, Collection getWorkStreamLatencies) { - Work work = new Work(workItem, watermarks, processingContext, clock); + Instant startTime = clock.get(); + Work work = + new Work( + ShardedKey.create(workItem.getKey(), workItem.getShardingKey()), + workItem, + processingContext, + watermarks, + clock, + startTime, + new EnumMap<>(LatencyAttribution.State.class), + WorkId.of(workItem), + buildLatencyTrackingId(workItem), + () -> {}, + TimedState.initialState(startTime), + false); work.recordGetWorkStreamLatencies(getWorkStreamLatencies); return work; } @@ -106,8 +134,10 @@ public static Work create( public static ProcessingContext createProcessingContext( String computationId, BiFunction getKeyedDataFn, - Consumer workCommitter) { - return ProcessingContext.create(computationId, getKeyedDataFn, workCommitter); + Consumer workCommitter, + HeartbeatSender heartbeatSender) { + return ProcessingContext.create(computationId, getKeyedDataFn, workCommitter, heartbeatSender) + .build(); } private static LatencyAttribution.Builder createLatencyAttributionWithActiveLatencyBreakdown( @@ -147,6 +177,29 @@ private static LatencyAttribution.Builder createLatencyAttributionWithActiveLate return latencyAttribution; } + private static String buildLatencyTrackingId(WorkItem workItem) { + return Long.toHexString(workItem.getShardingKey()) + + '-' + + Long.toHexString(workItem.getWorkToken()); + } + + /** Returns a new {@link Work} instance with the same state and a different failure handler. 
*/ + public Work withFailureHandler(Runnable onFailed) { + return new Work( + shardedKey, + workItem, + processingContext, + watermarks, + clock, + startTime, + totalDurationPerState, + id, + latencyTrackingId, + onFailed, + currentState, + isFailed); + } + public WorkItem getWorkItem() { return workItem; } @@ -180,8 +233,28 @@ public void setState(State state) { this.currentState = TimedState.create(state, now); } + @Override + public boolean isRefreshable(Instant refreshDeadline) { + return getStartTime().isBefore(refreshDeadline) && !isFailed; + } + + @Override + public HeartbeatSender heartbeatSender() { + return processingContext.heartbeatSender(); + } + + @Override public void setFailed() { + LOG.debug( + "Failing work: [computationId= " + + processingContext.computationId() + + ", key=" + + shardedKey + + ", workId=" + + id + + "]. The work will be retried and is not lost."); this.isFailed = true; + onFailed.run(); } public boolean isCommitPending() { @@ -205,6 +278,7 @@ public WindmillStateReader createWindmillStateReader() { return WindmillStateReader.forWork(this); } + @Override public WorkId id() { return id; } @@ -216,6 +290,7 @@ private void recordGetWorkStreamLatencies(Collection getWork } } + @Override public ImmutableList getLatencyAttributions( boolean isHeartbeat, DataflowExecutionStateSampler sampler) { return Arrays.stream(LatencyAttribution.State.values()) @@ -260,11 +335,55 @@ public boolean isFailed() { return isFailed; } + public String backendWorkerToken() { + return processingContext.backendWorkerToken(); + } + boolean isStuckCommittingAt(Instant stuckCommitDeadline) { return currentState.state() == Work.State.COMMITTING && currentState.startTime().isBefore(stuckCommitDeadline); } + /** Returns a view of this {@link Work} instance for work refreshing. 
*/ + public RefreshableWork refreshableView() { + return this; + } + + @Override + public boolean equals(@Nullable Object o) { + if (o == null) return false; + if (this == o) return true; + if (!(o instanceof Work)) return false; + Work work = (Work) o; + return isFailed == work.isFailed + && Objects.equal(shardedKey, work.shardedKey) + && Objects.equal(workItem, work.workItem) + && Objects.equal(processingContext, work.processingContext) + && Objects.equal(watermarks, work.watermarks) + && Objects.equal(clock, work.clock) + && Objects.equal(startTime, work.startTime) + && Objects.equal(totalDurationPerState, work.totalDurationPerState) + && Objects.equal(id, work.id) + && Objects.equal(latencyTrackingId, work.latencyTrackingId) + && Objects.equal(currentState, work.currentState); + } + + @Override + public int hashCode() { + return Objects.hashCode( + shardedKey, + workItem, + processingContext, + watermarks, + clock, + startTime, + totalDurationPerState, + id, + latencyTrackingId, + currentState, + isFailed); + } + public enum State { QUEUED(LatencyAttribution.State.QUEUED), PROCESSING(LatencyAttribution.State.ACTIVE), @@ -311,17 +430,24 @@ private boolean isCommitPending() { @AutoValue public abstract static class ProcessingContext { + private static final String UNKNOWN_BACKEND_WORKER_TOKEN = ""; - private static ProcessingContext create( + private static ProcessingContext.Builder create( String computationId, BiFunction getKeyedDataFn, - Consumer workCommitter) { - return new AutoValue_Work_ProcessingContext( - computationId, - request -> Optional.ofNullable(getKeyedDataFn.apply(computationId, request)), - workCommitter); + Consumer workCommitter, + HeartbeatSender heartbeatSender) { + return new AutoValue_Work_ProcessingContext.Builder() + .setBackendWorkerToken(UNKNOWN_BACKEND_WORKER_TOKEN) + .setComputationId(computationId) + .setHeartbeatSender(heartbeatSender) + .setWorkCommitter(workCommitter) + .setKeyedDataFetcher( + request -> 
Optional.ofNullable(getKeyedDataFn.apply(computationId, request))); } + abstract String backendWorkerToken(); + /** Computation that the {@link Work} belongs to. */ public abstract String computationId(); @@ -334,5 +460,25 @@ private static ProcessingContext create( * {@link WorkItem}. */ public abstract Consumer workCommitter(); + + public abstract HeartbeatSender heartbeatSender(); + + public abstract Builder toBuilder(); + + @AutoValue.Builder + public abstract static class Builder { + public abstract Builder setBackendWorkerToken(String value); + + abstract Builder setComputationId(String value); + + abstract Builder setKeyedDataFetcher( + Function> value); + + abstract Builder setWorkCommitter(Consumer value); + + abstract Builder setHeartbeatSender(HeartbeatSender value); + + public abstract ProcessingContext build(); + } } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/WorkId.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/WorkId.java index f8f8d1901914e..d4e7f05d255fa 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/WorkId.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/WorkId.java @@ -41,9 +41,9 @@ public static WorkId of(Windmill.WorkItem workItem) { .build(); } - abstract long cacheToken(); + public abstract long cacheToken(); - abstract long workToken(); + public abstract long workToken(); @AutoValue.Builder public abstract static class Builder { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillConnection.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillConnection.java index a20c2f02b269e..37afe4bb1cf6b 100644 --- 
a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillConnection.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillConnection.java @@ -50,13 +50,13 @@ public static Builder builder() { public abstract CloudWindmillServiceV1Alpha1Stub stub(); @AutoValue.Builder - abstract static class Builder { + public abstract static class Builder { abstract Builder setBackendWorkerToken(String backendWorkerToken); public abstract Builder setDirectEndpoint(WindmillServiceAddress value); - abstract Builder setStub(CloudWindmillServiceV1Alpha1Stub stub); + public abstract Builder setStub(CloudWindmillServiceV1Alpha1Stub stub); - abstract WindmillConnection build(); + public abstract WindmillConnection build(); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java index 028a5c2e1d4b6..f41514fd4d745 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java @@ -69,6 +69,7 @@ public abstract class AbstractWindmillStream implements Win protected static final int RPC_STREAM_CHUNK_SIZE = 2 << 20; private static final Logger LOG = LoggerFactory.getLogger(AbstractWindmillStream.class); protected final AtomicBoolean clientClosed; + private final AtomicBoolean isShutdown; private final AtomicLong lastSendTimeMs; private final Executor executor; private final BackOff backoff; @@ -85,18 +86,21 @@ public abstract class AbstractWindmillStream implements Win // Indicates if the current 
stream in requestObserver is closed by calling close() method private final AtomicBoolean streamClosed; private @Nullable StreamObserver requestObserver; + private final String backendWorkerToken; protected AbstractWindmillStream( Function, StreamObserver> clientFactory, BackOff backoff, StreamObserverFactory streamObserverFactory, Set> streamRegistry, - int logEveryNStreamFailures) { + int logEveryNStreamFailures, + String backendWorkerToken) { + this.backendWorkerToken = backendWorkerToken; this.executor = Executors.newSingleThreadExecutor( new ThreadFactoryBuilder() .setDaemon(true) - .setNameFormat("WindmillStream-thread") + .setNameFormat(createThreadName(streamType(), backendWorkerToken)) .build()); this.backoff = backoff; this.streamRegistry = streamRegistry; @@ -111,12 +115,19 @@ protected AbstractWindmillStream( this.lastErrorTime = new AtomicReference<>(); this.sleepUntil = new AtomicLong(); this.finishLatch = new CountDownLatch(1); + this.isShutdown = new AtomicBoolean(false); this.requestObserverSupplier = () -> streamObserverFactory.from( clientFactory, new AbstractWindmillStream.ResponseObserver()); } + private static String createThreadName(Type streamType, String backendWorkerToken) { + return !backendWorkerToken.isEmpty() + ? 
String.format("%s-%s-WindmillStream-thread", streamType.name(), backendWorkerToken) + : String.format("%s-WindmillStream-thread", streamType.name()); + } + private static long debugDuration(long nowMs, long startMs) { if (startMs <= 0) { return -1; @@ -255,6 +266,28 @@ public final Instant startTime() { return new Instant(startTimeMs.get()); } + @Override + public String backendWorkerToken() { + return backendWorkerToken; + } + + @Override + public void shutdown() { + if (isShutdown.compareAndSet(false, true)) { + close(); + } + } + + @Override + public boolean isShutdown() { + return isShutdown.get(); + } + + private void setLastError(String error) { + lastError.set(error); + lastErrorTime.set(DateTime.now()); + } + private class ResponseObserver implements StreamObserver { @Override @@ -337,9 +370,4 @@ private void onStreamFinished(@Nullable Throwable t) { executor.execute(AbstractWindmillStream.this::startStream); } } - - private void setLastError(String error) { - lastError.set(error); - lastErrorTime.set(DateTime.now()); - } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java index d044e9300790b..5e79fe0a484e5 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java @@ -32,6 +32,10 @@ /** Superclass for streams returned by streaming Windmill methods. */ @ThreadSafe public interface WindmillStream { + + /** An identifier for the backend worker where the stream is sending/receiving RPCs. */ + String backendWorkerToken(); + /** Indicates that no more requests will be sent. 
*/ void close(); @@ -41,6 +45,24 @@ public interface WindmillStream { /** Returns when the stream was opened. */ Instant startTime(); + /** + * Shutdown the stream. There should be no further interactions with the stream once this has been + * called. + */ + void shutdown(); + + /** Reflects that {@link #shutdown()} was explicitly called. */ + boolean isShutdown(); + + Type streamType(); + + enum Type { + GET_WORKER_METADATA, + GET_WORK, + GET_DATA, + COMMIT_WORK, + } + /** Handle representing a stream of GetWork responses. */ @ThreadSafe interface GetWorkStream extends WindmillStream { @@ -49,6 +71,11 @@ interface GetWorkStream extends WindmillStream { /** Returns the remaining in-flight {@link GetWorkBudget}. */ GetWorkBudget remainingBudget(); + + @Override + default Type streamType() { + return Type.GET_WORK; + } } /** Interface for streaming GetDataRequests to Windmill. */ @@ -65,11 +92,27 @@ Windmill.KeyedGetDataResponse requestKeyedData( void refreshActiveWork(Map> heartbeats); void onHeartbeatResponse(List responses); + + @Override + default Type streamType() { + return Type.GET_DATA; + } } /** Interface for streaming CommitWorkRequests to Windmill. */ @ThreadSafe interface CommitWorkStream extends WindmillStream { + /** + * Returns a builder that can be used for sending requests. Each builder is not thread-safe but + * different builders for the same stream may be used simultaneously. + */ + CommitWorkStream.RequestBatcher batcher(); + + @Override + default Type streamType() { + return Type.COMMIT_WORK; + } + @NotThreadSafe interface RequestBatcher extends Closeable { /** @@ -92,15 +135,14 @@ default void close() { flush(); } } - - /** - * Returns a builder that can be used for sending requests. Each builder is not thread-safe but - * different builders for the same stream may be used simultaneously. - */ - RequestBatcher batcher(); } /** Interface for streaming GetWorkerMetadata requests to Windmill. 
*/ @ThreadSafe - interface GetWorkerMetadataStream extends WindmillStream {} + interface GetWorkerMetadataStream extends WindmillStream { + @Override + default Type streamType() { + return Type.GET_WORKER_METADATA; + } + } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitter.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitter.java index ed4dcfa212f1d..911b6809c2429 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitter.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitter.java @@ -20,6 +20,7 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; import java.util.function.Consumer; import java.util.function.Supplier; @@ -52,6 +53,7 @@ public final class StreamingEngineWorkCommitter implements WorkCommitter { private final AtomicLong activeCommitBytes; private final Consumer onCommitComplete; private final int numCommitSenders; + private final AtomicBoolean isRunning; private StreamingEngineWorkCommitter( Supplier> commitWorkStreamFactory, @@ -72,6 +74,7 @@ private StreamingEngineWorkCommitter( this.activeCommitBytes = new AtomicLong(); this.onCommitComplete = onCommitComplete; this.numCommitSenders = numCommitSenders; + this.isRunning = new AtomicBoolean(false); } public static StreamingEngineWorkCommitter create( @@ -85,7 +88,7 @@ public static StreamingEngineWorkCommitter create( @Override @SuppressWarnings("FutureReturnValueIgnored") public void start() { - if 
(!commitSenders.isShutdown()) { + if (isRunning.compareAndSet(false, true) && !commitSenders.isShutdown()) { for (int i = 0; i < numCommitSenders; i++) { commitSenders.submit(this::streamingCommitLoop); } @@ -94,7 +97,16 @@ public void start() { @Override public void commit(Commit commit) { - commitQueue.put(commit); + if (commit.work().isFailed() || !isRunning.get()) { + LOG.debug( + "Trying to queue commit on shutdown, failing commit=[computationId={}, shardingKey={}, workId={} ].", + commit.computationId(), + commit.work().getShardedKey(), + commit.work().id()); + failCommit(commit); + } else { + commitQueue.put(commit); + } } @Override @@ -104,17 +116,17 @@ public long currentActiveCommitBytes() { @Override public void stop() { - if (!commitSenders.isTerminated()) { + if (isRunning.compareAndSet(true, false) && !commitSenders.isTerminated()) { commitSenders.shutdownNow(); try { commitSenders.awaitTermination(10, TimeUnit.SECONDS); } catch (InterruptedException e) { LOG.warn( - "Commit senders didn't complete shutdown within 10 seconds, continuing to drain queue", + "Commit senders didn't complete shutdown within 10 seconds, continuing to drain queue.", e); } + drainCommitQueue(); } - drainCommitQueue(); } private void drainCommitQueue() { @@ -144,6 +156,7 @@ private void streamingCommitLoop() { // Block until we have a commit or are shutting down. 
initialCommit = commitQueue.take(); } catch (InterruptedException e) { + Thread.currentThread().interrupt(); return; } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ApplianceGetDataClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ApplianceGetDataClient.java index 4cee027581e8e..66c5100810de9 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ApplianceGetDataClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ApplianceGetDataClient.java @@ -23,14 +23,14 @@ import java.util.List; import java.util.Map; import java.util.concurrent.ExecutionException; -import java.util.stream.Collectors; import javax.annotation.concurrent.GuardedBy; import javax.annotation.concurrent.ThreadSafe; import org.apache.beam.runners.dataflow.worker.WindmillComputationKey; import org.apache.beam.runners.dataflow.worker.windmill.ApplianceWindmillClient; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.ComputationGetDataRequest; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.Heartbeat; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.SettableFuture; @@ -39,7 +39,7 @@ /** Appliance implementation of {@link GetDataClient}. 
*/ @Internal @ThreadSafe -public final class ApplianceGetDataClient implements GetDataClient { +public final class ApplianceGetDataClient implements GetDataClient, WorkRefreshClient { private static final int MAX_READS_PER_BATCH = 60; private static final int MAX_ACTIVE_READS = 10; @@ -52,7 +52,7 @@ public final class ApplianceGetDataClient implements GetDataClient { @GuardedBy("this") private int activeReadThreads; - private ApplianceGetDataClient( + public ApplianceGetDataClient( ApplianceWindmillClient windmillClient, ThrottlingGetDataMetricTracker getDataMetricTracker) { this.windmillClient = windmillClient; this.getDataMetricTracker = getDataMetricTracker; @@ -60,20 +60,6 @@ private ApplianceGetDataClient( this.activeReadThreads = 0; } - private static List convertToKeyedGetDataRequests( - List heartbeats) { - return heartbeats.stream() - .map( - request -> - Windmill.KeyedGetDataRequest.newBuilder() - .setShardingKey(request.getShardingKey()) - .setWorkToken(request.getWorkToken()) - .setCacheToken(request.getCacheToken()) - .addAllLatencyAttribution(request.getLatencyAttributionList()) - .build()) - .collect(Collectors.toList()); - } - public static GetDataClient create( ApplianceWindmillClient windmillClient, ThrottlingGetDataMetricTracker getDataMetricTracker) { return new ApplianceGetDataClient(windmillClient, getDataMetricTracker); @@ -120,24 +106,20 @@ public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) * translate the HeartbeatRequest to a KeyedGetDataRequest. 
*/ @Override - public void refreshActiveWork(Map> heartbeats) { + public void refreshActiveWork(Map heartbeats) { if (heartbeats.isEmpty()) { return; } - try (AutoCloseable ignored = getDataMetricTracker.trackHeartbeats(heartbeats.size())) { - List requests = - heartbeats.entrySet().stream() - .map( - entry -> - ComputationGetDataRequest.newBuilder() - .setComputationId(entry.getKey()) - .addAllRequests(convertToKeyedGetDataRequests(entry.getValue())) - .build()) - .collect(Collectors.toList()); - windmillClient.getData(Windmill.GetDataRequest.newBuilder().addAllRequests(requests).build()); - } catch (Exception e) { - throw new GetDataException("Error occurred refreshing heartbeats=" + heartbeats, e); + for (Map.Entry heartbeatToSend : heartbeats.entrySet()) { + HeartbeatSender heartbeatSender = heartbeatToSend.getKey(); + try (AutoCloseable ignored = + getDataMetricTracker.trackHeartbeats( + heartbeatToSend.getValue().heartbeatRequests().size())) { + heartbeatSender.sendHeartbeats(heartbeatToSend.getValue()); + } catch (Exception e) { + throw new GetDataException("Error occurred refreshing heartbeats=" + heartbeatToSend, e); + } } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/DirectGetDataClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/DirectGetDataClient.java new file mode 100644 index 0000000000000..d9490f8bc7532 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/DirectGetDataClient.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.client.getdata; + +import java.util.function.Supplier; +import org.apache.beam.runners.dataflow.worker.WorkItemCancelledException; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; +import org.apache.beam.sdk.annotations.Internal; + +/** {@link GetDataClient} that fetches data directly from a specific {@link GetDataStream}. 
*/ +@Internal +public final class DirectGetDataClient implements GetDataClient { + + private final GetDataStream directGetDataStream; + private final Supplier sideInputGetDataStream; + private final ThrottlingGetDataMetricTracker getDataMetricTracker; + + private DirectGetDataClient( + GetDataStream directGetDataStream, + Supplier sideInputGetDataStream, + ThrottlingGetDataMetricTracker getDataMetricTracker) { + this.directGetDataStream = directGetDataStream; + this.sideInputGetDataStream = sideInputGetDataStream; + this.getDataMetricTracker = getDataMetricTracker; + } + + public static GetDataClient create( + GetDataStream getDataStream, + Supplier sideInputGetDataStream, + ThrottlingGetDataMetricTracker getDataMetricTracker) { + return new DirectGetDataClient(getDataStream, sideInputGetDataStream, getDataMetricTracker); + } + + @Override + public Windmill.KeyedGetDataResponse getStateData( + String computation, Windmill.KeyedGetDataRequest request) { + if (directGetDataStream.isShutdown()) { + throw new WorkItemCancelledException(request.getShardingKey()); + } + + try (AutoCloseable ignored = + getDataMetricTracker.trackSingleCallWithThrottling( + ThrottlingGetDataMetricTracker.Type.STATE)) { + return directGetDataStream.requestKeyedData(computation, request); + } catch (Exception e) { + if (directGetDataStream.isShutdown()) { + throw new WorkItemCancelledException(request.getShardingKey()); + } + + throw new GetDataException( + "Error occurred fetching state for computation=" + + computation + + ", key=" + + request.getShardingKey(), + e); + } + } + + @Override + public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { + GetDataStream sideInputGetDataStream = this.sideInputGetDataStream.get(); + if (sideInputGetDataStream.isShutdown()) { + throw new GetDataException( + "Error occurred fetching side input for tag=" + request.getDataId()); + } + + try (AutoCloseable ignored = + getDataMetricTracker.trackSingleCallWithThrottling( + 
ThrottlingGetDataMetricTracker.Type.SIDE_INPUT)) { + return sideInputGetDataStream.requestGlobalData(request); + } catch (Exception e) { + throw new GetDataException( + "Error occurred fetching side input for tag=" + request.getDataId(), e); + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FanOutWorkRefreshClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FanOutWorkRefreshClient.java new file mode 100644 index 0000000000000..d38ae3120dbcf --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FanOutWorkRefreshClient.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.client.getdata; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.Heartbeat; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; +import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; + +/** + * {@link WorkRefreshClient} that fans out heartbeats to all {@link HeartbeatSender}(s) in parallel + * passed into {@link #refreshActiveWork(Map)} + */ +@Internal +public final class FanOutWorkRefreshClient implements WorkRefreshClient { + private static final String FAN_OUT_REFRESH_WORK_EXECUTOR_NAME = + "FanOutActiveWorkRefreshExecutor"; + + private final ThrottlingGetDataMetricTracker getDataMetricTracker; + private final ExecutorService fanOutActiveWorkRefreshExecutor; + + public FanOutWorkRefreshClient(ThrottlingGetDataMetricTracker getDataMetricTracker) { + this.getDataMetricTracker = getDataMetricTracker; + this.fanOutActiveWorkRefreshExecutor = + Executors.newCachedThreadPool( + new ThreadFactoryBuilder().setNameFormat(FAN_OUT_REFRESH_WORK_EXECUTOR_NAME).build()); + } + + @Override + public void refreshActiveWork(Map heartbeats) { + List> fanOutRefreshActiveWork = new ArrayList<>(); + for (Map.Entry heartbeat : heartbeats.entrySet()) { + fanOutRefreshActiveWork.add(sendHeartbeatOnStreamFuture(heartbeat)); + } + + // Don't block until we kick off all the refresh active work RPCs. 
+ @SuppressWarnings("rawtypes") + CompletableFuture parallelFanOutRefreshActiveWork = + CompletableFuture.allOf(fanOutRefreshActiveWork.toArray(new CompletableFuture[0])); + parallelFanOutRefreshActiveWork.join(); + } + + private CompletableFuture sendHeartbeatOnStreamFuture( + Map.Entry heartbeat) { + return CompletableFuture.runAsync( + () -> { + try (AutoCloseable ignored = + getDataMetricTracker.trackHeartbeats( + heartbeat.getValue().heartbeatRequests().size())) { + HeartbeatSender sender = heartbeat.getKey(); + Heartbeat heartbeats = heartbeat.getValue(); + sender.sendHeartbeats(heartbeats); + } catch (Exception e) { + throw new GetDataClient.GetDataException("Error refreshing heartbeats.", e); + } + }, + fanOutActiveWorkRefreshExecutor); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/GetDataClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/GetDataClient.java index 8e311f8b98558..4577b29f8850f 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/GetDataClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/GetDataClient.java @@ -18,11 +18,8 @@ package org.apache.beam.runners.dataflow.worker.windmill.client.getdata; import java.io.PrintWriter; -import java.util.List; -import java.util.Map; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalData; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalDataRequest; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.KeyedGetDataRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.KeyedGetDataResponse; import 
org.apache.beam.sdk.annotations.Internal; @@ -34,13 +31,15 @@ public interface GetDataClient { GlobalData getSideInputData(GlobalDataRequest request); - void refreshActiveWork(Map> heartbeats); - default void printHtml(PrintWriter writer) {} class GetDataException extends RuntimeException { protected GetDataException(String message, Throwable cause) { super(message, cause); } + + public GetDataException(String message) { + super(message); + } } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamingEngineGetDataClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamingEngineGetDataClient.java index bda007584b2dd..fe2758682f3c5 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamingEngineGetDataClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamingEngineGetDataClient.java @@ -17,20 +17,18 @@ */ package org.apache.beam.runners.dataflow.worker.windmill.client.getdata; -import com.google.auto.value.AutoBuilder; import java.io.PrintWriter; -import java.util.List; import java.util.Map; import javax.annotation.concurrent.ThreadSafe; -import org.apache.beam.runners.dataflow.worker.windmill.StreamingEngineWindmillClient; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalDataRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.KeyedGetDataRequest; import org.apache.beam.runners.dataflow.worker.windmill.client.CloseableStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStreamPool; +import 
org.apache.beam.runners.dataflow.worker.windmill.work.refresh.Heartbeat; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.sdk.annotations.Internal; -import org.joda.time.Duration; /** * StreamingEngine implementation of {@link GetDataClient}. @@ -40,38 +38,16 @@ */ @Internal @ThreadSafe -public final class StreamingEngineGetDataClient implements GetDataClient { - private static final Duration STREAM_TIMEOUT = Duration.standardSeconds(30); +public final class StreamingEngineGetDataClient implements GetDataClient, WorkRefreshClient { private final WindmillStreamPool getDataStreamPool; - private final WindmillStreamPool heartbeatStreamPool; private final ThrottlingGetDataMetricTracker getDataMetricTracker; - StreamingEngineGetDataClient( - StreamingEngineWindmillClient windmillClient, + public StreamingEngineGetDataClient( ThrottlingGetDataMetricTracker getDataMetricTracker, - boolean useSeparateHeartbeatStreams, - int numGetDataStreams) { + WindmillStreamPool getDataStreamPool) { this.getDataMetricTracker = getDataMetricTracker; - this.getDataStreamPool = - WindmillStreamPool.create( - Math.max(1, numGetDataStreams), STREAM_TIMEOUT, windmillClient::getDataStream); - if (useSeparateHeartbeatStreams) { - this.heartbeatStreamPool = - WindmillStreamPool.create(1, STREAM_TIMEOUT, windmillClient::getDataStream); - } else { - this.heartbeatStreamPool = this.getDataStreamPool; - } - } - - public static Builder builder( - StreamingEngineWindmillClient windmillClient, - ThrottlingGetDataMetricTracker getDataMetricTracker) { - return new AutoBuilder_StreamingEngineGetDataClient_Builder() - .setWindmillClient(windmillClient) - .setGetDataMetricTracker(getDataMetricTracker) - .setUseSeparateHeartbeatStreams(false) - .setNumGetDataStreams(1); + this.getDataStreamPool = getDataStreamPool; } @Override @@ -96,7 +72,7 @@ public Windmill.KeyedGetDataResponse getStateData( public Windmill.GlobalData 
getSideInputData(GlobalDataRequest request) { try (AutoCloseable ignored = getDataMetricTracker.trackSingleCallWithThrottling( - ThrottlingGetDataMetricTracker.Type.STATE); + ThrottlingGetDataMetricTracker.Type.SIDE_INPUT); CloseableStream closeableStream = getDataStreamPool.getCloseableStream()) { return closeableStream.stream().requestGlobalData(request); } catch (Exception e) { @@ -106,18 +82,19 @@ public Windmill.GlobalData getSideInputData(GlobalDataRequest request) { } @Override - public void refreshActiveWork(Map> heartbeats) { + public void refreshActiveWork(Map heartbeats) { if (heartbeats.isEmpty()) { return; } - try (AutoCloseable ignored = - getDataMetricTracker.trackSingleCallWithThrottling( - ThrottlingGetDataMetricTracker.Type.STATE); - CloseableStream closeableStream = heartbeatStreamPool.getCloseableStream()) { - closeableStream.stream().refreshActiveWork(heartbeats); - } catch (Exception e) { - throw new GetDataException("Error occurred refreshing heartbeats=" + heartbeats, e); + for (Map.Entry heartbeatToSend : heartbeats.entrySet()) { + try (AutoCloseable ignored = + getDataMetricTracker.trackHeartbeats( + heartbeatToSend.getValue().heartbeatRequests().size())) { + heartbeatToSend.getKey().sendHeartbeats(heartbeatToSend.getValue()); + } catch (Exception e) { + throw new GetDataException("Error occurred refreshing heartbeats=" + heartbeatToSend, e); + } } } @@ -125,22 +102,4 @@ public void refreshActiveWork(Map> heart public void printHtml(PrintWriter writer) { getDataMetricTracker.printHtml(writer); } - - @Internal - @AutoBuilder - public abstract static class Builder { - abstract Builder setWindmillClient(StreamingEngineWindmillClient windmillClient); - - abstract Builder setGetDataMetricTracker(ThrottlingGetDataMetricTracker getDataMetricTracker); - - public abstract Builder setUseSeparateHeartbeatStreams(boolean useSeparateHeartbeatStreams); - - public abstract Builder setNumGetDataStreams(int numGetDataStreams); - - abstract 
StreamingEngineGetDataClient autoBuild(); - - public final GetDataClient build() { - return autoBuild(); - } - } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/WorkRefreshClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/WorkRefreshClient.java new file mode 100644 index 0000000000000..a5bcba9cf3da3 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/WorkRefreshClient.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.client.getdata; + +import java.util.Map; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.Heartbeat; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; + +/** Client for requesting work refresh via heartbeats. 
*/ +public interface WorkRefreshClient { + void refreshActiveWork(Map heartbeats); +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcCommitWorkStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcCommitWorkStream.java index f9f579119d616..232461e34e633 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcCommitWorkStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcCommitWorkStream.java @@ -57,6 +57,7 @@ public final class GrpcCommitWorkStream private final int streamingRpcBatchLimit; private GrpcCommitWorkStream( + String backendWorkerToken, Function, StreamObserver> startCommitWorkRpcFn, BackOff backoff, @@ -72,7 +73,8 @@ private GrpcCommitWorkStream( backoff, streamObserverFactory, streamRegistry, - logEveryNStreamFailures); + logEveryNStreamFailures, + backendWorkerToken); pending = new ConcurrentHashMap<>(); this.idGenerator = idGenerator; this.jobHeader = jobHeader; @@ -81,6 +83,7 @@ private GrpcCommitWorkStream( } public static GrpcCommitWorkStream create( + String backendWorkerToken, Function, StreamObserver> startCommitWorkRpcFn, BackOff backoff, @@ -93,6 +96,7 @@ public static GrpcCommitWorkStream create( int streamingRpcBatchLimit) { GrpcCommitWorkStream commitWorkStream = new GrpcCommitWorkStream( + backendWorkerToken, startCommitWorkRpcFn, backoff, streamObserverFactory, diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java index 6f4b5b7b33fb7..20fbf2fb7619e 100644 --- 
a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java @@ -43,6 +43,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.ThrottleTimer; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemScheduler; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.DirectHeartbeatSender; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.sdk.util.BackOff; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; @@ -91,6 +92,7 @@ public final class GrpcDirectGetWorkStream private final ConcurrentMap workItemBuffers; private GrpcDirectGetWorkStream( + String backendWorkerToken, Function< StreamObserver, StreamObserver> @@ -105,7 +107,12 @@ private GrpcDirectGetWorkStream( Supplier workCommitter, WorkItemScheduler workItemScheduler) { super( - startGetWorkRpcFn, backoff, streamObserverFactory, streamRegistry, logEveryNStreamFailures); + startGetWorkRpcFn, + backoff, + streamObserverFactory, + streamRegistry, + logEveryNStreamFailures, + backendWorkerToken); this.request = request; this.getWorkThrottleTimer = getWorkThrottleTimer; this.workItemScheduler = workItemScheduler; @@ -120,6 +127,7 @@ private GrpcDirectGetWorkStream( } public static GrpcDirectGetWorkStream create( + String backendWorkerToken, Function< StreamObserver, StreamObserver> @@ -135,6 +143,7 @@ public static GrpcDirectGetWorkStream create( WorkItemScheduler workItemScheduler) { GrpcDirectGetWorkStream getWorkStream = new GrpcDirectGetWorkStream( + backendWorkerToken, startGetWorkRpcFn, request, backoff, @@ -327,7 +336,10 @@ private void runAndReset() { private Work.ProcessingContext 
createProcessingContext(String computationId) { return Work.createProcessingContext( - computationId, getDataStream.get()::requestKeyedData, workCommitter.get()::commit); + computationId, + getDataStream.get()::requestKeyedData, + workCommitter.get()::commit, + DirectHeartbeatSender.create(getDataStream.get())); } } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetDataStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetDataStream.java index feb15c2ac83cb..9f115ea26e813 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetDataStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetDataStream.java @@ -75,6 +75,7 @@ public final class GrpcGetDataStream private final Consumer> processHeartbeatResponses; private GrpcGetDataStream( + String backendWorkerToken, Function, StreamObserver> startGetDataRpcFn, BackOff backoff, @@ -88,7 +89,12 @@ private GrpcGetDataStream( boolean sendKeyedGetDataRequests, Consumer> processHeartbeatResponses) { super( - startGetDataRpcFn, backoff, streamObserverFactory, streamRegistry, logEveryNStreamFailures); + startGetDataRpcFn, + backoff, + streamObserverFactory, + streamRegistry, + logEveryNStreamFailures, + backendWorkerToken); this.idGenerator = idGenerator; this.getDataThrottleTimer = getDataThrottleTimer; this.jobHeader = jobHeader; @@ -100,6 +106,7 @@ private GrpcGetDataStream( } public static GrpcGetDataStream create( + String backendWorkerToken, Function, StreamObserver> startGetDataRpcFn, BackOff backoff, @@ -114,6 +121,7 @@ public static GrpcGetDataStream create( Consumer> processHeartbeatResponses) { GrpcGetDataStream getDataStream = new GrpcGetDataStream( + backendWorkerToken, 
startGetDataRpcFn, backoff, streamObserverFactory, diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkStream.java index 867180fb0d31c..5fc093ee32aa9 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkStream.java @@ -60,6 +60,7 @@ public final class GrpcGetWorkStream private final AtomicLong inflightBytes; private GrpcGetWorkStream( + String backendWorkerToken, Function< StreamObserver, StreamObserver> @@ -72,7 +73,12 @@ private GrpcGetWorkStream( ThrottleTimer getWorkThrottleTimer, WorkItemReceiver receiver) { super( - startGetWorkRpcFn, backoff, streamObserverFactory, streamRegistry, logEveryNStreamFailures); + startGetWorkRpcFn, + backoff, + streamObserverFactory, + streamRegistry, + logEveryNStreamFailures, + backendWorkerToken); this.request = request; this.getWorkThrottleTimer = getWorkThrottleTimer; this.receiver = receiver; @@ -82,6 +88,7 @@ private GrpcGetWorkStream( } public static GrpcGetWorkStream create( + String backendWorkerToken, Function< StreamObserver, StreamObserver> @@ -95,6 +102,7 @@ public static GrpcGetWorkStream create( WorkItemReceiver receiver) { GrpcGetWorkStream getWorkStream = new GrpcGetWorkStream( + backendWorkerToken, startGetWorkRpcFn, request, backoff, diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStream.java index 
3672f02c813f2..6f734b7da9dcb 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStream.java @@ -69,7 +69,8 @@ private GrpcGetWorkerMetadataStream( backoff, streamObserverFactory, streamRegistry, - logEveryNStreamFailures); + logEveryNStreamFailures, + ""); this.workerMetadataRequest = WorkerMetadataRequest.newBuilder().setHeader(jobHeader).build(); this.metadataVersion = metadataVersion; this.getWorkerMetadataThrottleTimer = getWorkerMetadataThrottleTimer; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillStreamFactory.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillStreamFactory.java index 14866f3f586be..1623dfcc7d6f2 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillStreamFactory.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillStreamFactory.java @@ -37,6 +37,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.Windmill.ComputationHeartbeatResponse; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetWorkRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.JobHeader; +import org.apache.beam.runners.dataflow.worker.windmill.WindmillConnection; import org.apache.beam.runners.dataflow.worker.windmill.WindmillEndpoints; import org.apache.beam.runners.dataflow.worker.windmill.client.AbstractWindmillStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.CommitWorkStream; @@ -69,6 +70,7 
@@ public class GrpcWindmillStreamFactory implements StatusDataProvider { private static final int DEFAULT_STREAMING_RPC_BATCH_LIMIT = Integer.MAX_VALUE; private static final int DEFAULT_WINDMILL_MESSAGES_BETWEEN_IS_READY_CHECKS = 1; private static final int NO_HEALTH_CHECKS = -1; + private static final String NO_BACKEND_WORKER_TOKEN = ""; private final JobHeader jobHeader; private final int logEveryNStreamFailures; @@ -179,6 +181,7 @@ public GetWorkStream createGetWorkStream( ThrottleTimer getWorkThrottleTimer, WorkItemReceiver processWorkItem) { return GrpcGetWorkStream.create( + NO_BACKEND_WORKER_TOKEN, responseObserver -> withDefaultDeadline(stub).getWorkStream(responseObserver), request, grpcBackOff.get(), @@ -190,14 +193,15 @@ public GetWorkStream createGetWorkStream( } public GetWorkStream createDirectGetWorkStream( - CloudWindmillServiceV1Alpha1Stub stub, + WindmillConnection connection, GetWorkRequest request, ThrottleTimer getWorkThrottleTimer, Supplier getDataStream, Supplier workCommitter, WorkItemScheduler workItemScheduler) { return GrpcDirectGetWorkStream.create( - responseObserver -> withDefaultDeadline(stub).getWorkStream(responseObserver), + connection.backendWorkerToken().orElse(NO_BACKEND_WORKER_TOKEN), + responseObserver -> withDefaultDeadline(connection.stub()).getWorkStream(responseObserver), request, grpcBackOff.get(), newStreamObserverFactory(), @@ -212,6 +216,7 @@ public GetWorkStream createDirectGetWorkStream( public GetDataStream createGetDataStream( CloudWindmillServiceV1Alpha1Stub stub, ThrottleTimer getDataThrottleTimer) { return GrpcGetDataStream.create( + NO_BACKEND_WORKER_TOKEN, responseObserver -> withDefaultDeadline(stub).getDataStream(responseObserver), grpcBackOff.get(), newStreamObserverFactory(), @@ -228,6 +233,7 @@ public GetDataStream createGetDataStream( public CommitWorkStream createCommitWorkStream( CloudWindmillServiceV1Alpha1Stub stub, ThrottleTimer commitWorkThrottleTimer) { return GrpcCommitWorkStream.create( + 
NO_BACKEND_WORKER_TOKEN, responseObserver -> withDefaultDeadline(stub).commitWorkStream(responseObserver), grpcBackOff.get(), newStreamObserverFactory(), diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClient.java index 4760062c5754a..66bebd0e939b4 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClient.java @@ -390,7 +390,7 @@ private WindmillStreamSender createAndStartWindmillStreamSenderFor( // GetWorkBudgetDistributor. WindmillStreamSender windmillStreamSender = WindmillStreamSender.create( - connection.stub(), + connection, GetWorkRequest.newBuilder() .setClientId(clientId) .setJobId(jobHeader.getJobId()) diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSender.java index e9f008eb522eb..16890e0b69f4a 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSender.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSender.java @@ -22,8 +22,8 @@ import java.util.function.Function; import java.util.function.Supplier; import javax.annotation.concurrent.ThreadSafe; -import 
org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc.CloudWindmillServiceV1Alpha1Stub; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetWorkRequest; +import org.apache.beam.runners.dataflow.worker.windmill.WindmillConnection; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.CommitWorkStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; @@ -65,7 +65,7 @@ public class WindmillStreamSender { private final StreamingEngineThrottleTimers streamingEngineThrottleTimers; private WindmillStreamSender( - CloudWindmillServiceV1Alpha1Stub stub, + WindmillConnection connection, GetWorkRequest getWorkRequest, AtomicReference getWorkBudget, GrpcWindmillStreamFactory streamingEngineStreamFactory, @@ -83,19 +83,19 @@ private WindmillStreamSender( Suppliers.memoize( () -> streamingEngineStreamFactory.createGetDataStream( - stub, streamingEngineThrottleTimers.getDataThrottleTimer())); + connection.stub(), streamingEngineThrottleTimers.getDataThrottleTimer())); this.commitWorkStream = Suppliers.memoize( () -> streamingEngineStreamFactory.createCommitWorkStream( - stub, streamingEngineThrottleTimers.commitWorkThrottleTimer())); + connection.stub(), streamingEngineThrottleTimers.commitWorkThrottleTimer())); this.workCommitter = Suppliers.memoize(() -> workCommitterFactory.apply(commitWorkStream.get())); this.getWorkStream = Suppliers.memoize( () -> streamingEngineStreamFactory.createDirectGetWorkStream( - stub, + connection, withRequestBudget(getWorkRequest, getWorkBudget.get()), streamingEngineThrottleTimers.getWorkThrottleTimer(), getDataStream, @@ -104,14 +104,14 @@ private WindmillStreamSender( } public static WindmillStreamSender create( - CloudWindmillServiceV1Alpha1Stub stub, + WindmillConnection connection, GetWorkRequest getWorkRequest, GetWorkBudget getWorkBudget, 
GrpcWindmillStreamFactory streamingEngineStreamFactory, WorkItemScheduler workItemScheduler, Function workCommitterFactory) { return new WindmillStreamSender( - stub, + connection, getWorkRequest, new AtomicReference<>(getWorkBudget), streamingEngineStreamFactory, diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/observers/StreamObserverCancelledException.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/observers/StreamObserverCancelledException.java new file mode 100644 index 0000000000000..4ea209f31b1d9 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/observers/StreamObserverCancelledException.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc.observers; + +import org.apache.beam.sdk.annotations.Internal; + +@Internal +public final class StreamObserverCancelledException extends RuntimeException { + public StreamObserverCancelledException(Throwable cause) { + super(cause); + } + + public StreamObserverCancelledException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java index 96a6feec1da0d..11197c74d21f4 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java @@ -17,13 +17,22 @@ */ package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; +import java.util.ArrayList; import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; +import java.util.function.Consumer; import java.util.function.Supplier; import javax.annotation.concurrent.ThreadSafe; import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; import org.apache.beam.runners.dataflow.worker.streaming.ComputationState; +import org.apache.beam.runners.dataflow.worker.streaming.RefreshableWork; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; +import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Table; import org.joda.time.Duration; import 
org.joda.time.Instant; import org.slf4j.Logger; @@ -37,29 +46,33 @@ * threshold is determined by {@link #activeWorkRefreshPeriodMillis} */ @ThreadSafe -public abstract class ActiveWorkRefresher { +@Internal +public final class ActiveWorkRefresher { private static final Logger LOG = LoggerFactory.getLogger(ActiveWorkRefresher.class); - protected final Supplier clock; - protected final int activeWorkRefreshPeriodMillis; - protected final Supplier> computations; - protected final DataflowExecutionStateSampler sampler; + private final Supplier clock; + private final int activeWorkRefreshPeriodMillis; + private final Supplier> computations; + private final DataflowExecutionStateSampler sampler; private final int stuckCommitDurationMillis; private final ScheduledExecutorService activeWorkRefreshExecutor; + private final Consumer> heartbeatSender; - protected ActiveWorkRefresher( + public ActiveWorkRefresher( Supplier clock, int activeWorkRefreshPeriodMillis, int stuckCommitDurationMillis, Supplier> computations, DataflowExecutionStateSampler sampler, - ScheduledExecutorService activeWorkRefreshExecutor) { + ScheduledExecutorService activeWorkRefreshExecutor, + Consumer> heartbeatSender) { this.clock = clock; this.activeWorkRefreshPeriodMillis = activeWorkRefreshPeriodMillis; this.stuckCommitDurationMillis = stuckCommitDurationMillis; this.computations = computations; this.sampler = sampler; this.activeWorkRefreshExecutor = activeWorkRefreshExecutor; + this.heartbeatSender = heartbeatSender; } @SuppressWarnings("FutureReturnValueIgnored") @@ -103,5 +116,35 @@ private void invalidateStuckCommits() { } } - protected abstract void refreshActiveWork(); + private void refreshActiveWork() { + Instant refreshDeadline = clock.get().minus(Duration.millis(activeWorkRefreshPeriodMillis)); + + Map fannedOutHeartbeatRequests = new HashMap<>(); + + for (ComputationState computationState : computations.get()) { + String computationId = computationState.getComputationId(); + + // Get 
heartbeat requests for computation's current active work, aggregated by GetDataStream + // to correctly fan-out the heartbeat requests. + Table heartbeats = + HeartbeatRequests.getRefreshableKeyHeartbeats( + computationState.currentActiveWorkReadOnly(), refreshDeadline, sampler); + + // Aggregate the heartbeats across computations by GetDataStream for correct fan out. + for (Table.Cell heartbeatsPerStream : + heartbeats.cellSet()) { + Heartbeat heartbeat = + fannedOutHeartbeatRequests.computeIfAbsent( + heartbeatsPerStream.getRowKey(), ignored -> Heartbeat.create()); + heartbeat.work().add(heartbeatsPerStream.getColumnKey()); + List existingHeartbeatsForComputation = + heartbeat + .heartbeatRequests() + .computeIfAbsent(computationId, ignored -> new ArrayList<>()); + existingHeartbeatsForComputation.add(heartbeatsPerStream.getValue()); + } + } + + heartbeatSender.accept(fannedOutHeartbeatRequests); + } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefreshers.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefreshers.java deleted file mode 100644 index 5a59a7f1ae01d..0000000000000 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefreshers.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; - -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ScheduledExecutorService; -import java.util.function.Consumer; -import java.util.function.Supplier; -import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; -import org.apache.beam.runners.dataflow.worker.streaming.ComputationState; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; -import org.joda.time.Instant; - -/** Utility class for {@link ActiveWorkRefresher}. 
*/ -public final class ActiveWorkRefreshers { - public static ActiveWorkRefresher createDispatchedActiveWorkRefresher( - Supplier clock, - int activeWorkRefreshPeriodMillis, - int stuckCommitDurationMillis, - Supplier> computations, - DataflowExecutionStateSampler sampler, - Consumer>> activeWorkRefresherFn, - ScheduledExecutorService scheduledExecutorService) { - return new DispatchedActiveWorkRefresher( - clock, - activeWorkRefreshPeriodMillis, - stuckCommitDurationMillis, - computations, - sampler, - activeWorkRefresherFn, - scheduledExecutorService); - } -} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ApplianceHeartbeatSender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ApplianceHeartbeatSender.java new file mode 100644 index 0000000000000..ab32108ae5bae --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ApplianceHeartbeatSender.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; + +import java.util.List; +import java.util.Map; +import java.util.function.Consumer; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.sdk.annotations.Internal; + +/** Streaming appliance implementation of {@link HeartbeatSender}. */ +@Internal +public final class ApplianceHeartbeatSender implements HeartbeatSender { + private final Consumer sendHeartbeatFn; + + public ApplianceHeartbeatSender(Consumer sendHeartbeatFn) { + this.sendHeartbeatFn = sendHeartbeatFn; + } + + /** + * Appliance which sends heartbeats (used to refresh active work) as KeyedGetDataRequests. So we + * must translate the HeartbeatRequest to a KeyedGetDataRequest here. + */ + @Override + public void sendHeartbeats(Heartbeat heartbeats) { + Windmill.GetDataRequest.Builder builder = Windmill.GetDataRequest.newBuilder(); + + for (Map.Entry> entry : + heartbeats.heartbeatRequests().entrySet()) { + Windmill.ComputationGetDataRequest.Builder perComputationBuilder = + Windmill.ComputationGetDataRequest.newBuilder(); + perComputationBuilder.setComputationId(entry.getKey()); + for (Windmill.HeartbeatRequest request : entry.getValue()) { + perComputationBuilder.addRequests( + Windmill.KeyedGetDataRequest.newBuilder() + .setShardingKey(request.getShardingKey()) + .setWorkToken(request.getWorkToken()) + .setCacheToken(request.getCacheToken()) + .addAllLatencyAttribution(request.getLatencyAttributionList()) + .build()); + } + builder.addRequests(perComputationBuilder.build()); + } + + sendHeartbeatFn.accept(builder.build()); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/DirectHeartbeatSender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/DirectHeartbeatSender.java new file mode 100644 index 0000000000000..0c5b16ad1b894 --- 
/dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/DirectHeartbeatSender.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; + +import org.apache.beam.runners.dataflow.worker.streaming.RefreshableWork; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; +import org.apache.beam.sdk.annotations.Internal; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * {@link HeartbeatSender} implementation that sends heartbeats directly on the underlying stream if + * the stream is not closed. + * + * @implNote + *

{@link #equals(Object)} and {@link #hashCode()} implementations delegate to internal + * {@link GetDataStream} implementations so that requests can be grouped and sent on the same + * stream instance. + */ +@Internal +public final class DirectHeartbeatSender implements HeartbeatSender { + private static final Logger LOG = LoggerFactory.getLogger(DirectHeartbeatSender.class); + private final GetDataStream getDataStream; + + private DirectHeartbeatSender(GetDataStream getDataStream) { + this.getDataStream = getDataStream; + } + + public static DirectHeartbeatSender create(GetDataStream getDataStream) { + return new DirectHeartbeatSender(getDataStream); + } + + @Override + public void sendHeartbeats(Heartbeat heartbeats) { + if (getDataStream.isShutdown()) { + LOG.warn( + "Trying to refresh work w/ {} heartbeats on stream={} after work has moved off of worker.", + heartbeats.heartbeatRequests().size(), + getDataStream.backendWorkerToken()); + heartbeats.work().forEach(RefreshableWork::setFailed); + } else { + getDataStream.refreshActiveWork(heartbeats.heartbeatRequests()); + } + } + + @Override + public int hashCode() { + return getDataStream.hashCode(); + } + + @Override + public boolean equals(Object obj) { + return obj instanceof DirectHeartbeatSender + && getDataStream.equals(((DirectHeartbeatSender) obj).getDataStream); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/DispatchedActiveWorkRefresher.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/DispatchedActiveWorkRefresher.java deleted file mode 100644 index f81233498fe32..0000000000000 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/DispatchedActiveWorkRefresher.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) 
under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; - -import java.util.Collection; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ScheduledExecutorService; -import java.util.function.Consumer; -import java.util.function.Supplier; -import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; -import org.apache.beam.runners.dataflow.worker.streaming.ComputationState; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.joda.time.Duration; -import org.joda.time.Instant; - -final class DispatchedActiveWorkRefresher extends ActiveWorkRefresher { - - private final Consumer>> activeWorkRefresherFn; - - DispatchedActiveWorkRefresher( - Supplier clock, - int activeWorkRefreshPeriodMillis, - int stuckCommitDurationMillis, - Supplier> computations, - DataflowExecutionStateSampler sampler, - Consumer>> activeWorkRefresherFn, - ScheduledExecutorService scheduledExecutorService) { - super( - clock, - activeWorkRefreshPeriodMillis, - stuckCommitDurationMillis, - computations, - sampler, - scheduledExecutorService); - this.activeWorkRefresherFn = activeWorkRefresherFn; - } - - @Override - protected void 
refreshActiveWork() { - Map> heartbeats = new HashMap<>(); - Instant refreshDeadline = clock.get().minus(Duration.millis(activeWorkRefreshPeriodMillis)); - - for (ComputationState computationState : computations.get()) { - heartbeats.put( - computationState.getComputationId(), - computationState.getKeyHeartbeats(refreshDeadline, sampler)); - } - - activeWorkRefresherFn.accept(heartbeats); - } -} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeat.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeat.java new file mode 100644 index 0000000000000..8cc7d50dd48e6 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeat.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; + +import com.google.auto.value.AutoValue; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import org.apache.beam.runners.dataflow.worker.streaming.RefreshableWork; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; + +/** Heartbeat requests and the work that was used to generate the heartbeat requests. */ +@AutoValue +public abstract class Heartbeat { + + static Heartbeat create() { + return new AutoValue_Heartbeat(new HashSet<>(), new HashMap<>()); + } + + abstract Collection work(); + + public abstract Map> heartbeatRequests(); +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatRequests.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatRequests.java new file mode 100644 index 0000000000000..792548f93b420 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatRequests.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; + +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableTable.toImmutableTable; + +import com.google.auto.value.AutoValue; +import java.util.Collection; +import java.util.Map; +import java.util.stream.Stream; +import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; +import org.apache.beam.runners.dataflow.worker.streaming.RefreshableWork; +import org.apache.beam.runners.dataflow.worker.streaming.ShardedKey; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; +import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableListMultimap; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Table; +import org.joda.time.Instant; + +/** Helper factory class for creating heartbeat requests. 
*/ +@Internal +public final class HeartbeatRequests { + + private HeartbeatRequests() {} + + static Table getRefreshableKeyHeartbeats( + ImmutableListMultimap activeWork, + Instant refreshDeadline, + DataflowExecutionStateSampler sampler) { + return activeWork.asMap().entrySet().stream() + .flatMap(e -> toHeartbeatRow(e, refreshDeadline, sampler)) + .collect(toImmutableTable(HeartbeatRow::sender, HeartbeatRow::work, HeartbeatRow::request)); + } + + private static Stream toHeartbeatRow( + Map.Entry> shardedKeyAndWorkQueue, + Instant refreshDeadline, + DataflowExecutionStateSampler sampler) { + ShardedKey shardedKey = shardedKeyAndWorkQueue.getKey(); + Collection workQueue = shardedKeyAndWorkQueue.getValue(); + return workQueue.stream() + .filter(work -> work.isRefreshable(refreshDeadline)) + .map(work -> HeartbeatRow.create(work, createHeartbeatRequest(shardedKey, work, sampler))); + } + + private static HeartbeatRequest createHeartbeatRequest( + ShardedKey shardedKey, RefreshableWork work, DataflowExecutionStateSampler sampler) { + return HeartbeatRequest.newBuilder() + .setShardingKey(shardedKey.shardingKey()) + .setWorkToken(work.id().workToken()) + .setCacheToken(work.id().cacheToken()) + .addAllLatencyAttribution(work.getLatencyAttributions(/* isHeartbeat= */ true, sampler)) + .build(); + } + + @AutoValue + abstract static class HeartbeatRow { + + private static HeartbeatRow create(RefreshableWork work, HeartbeatRequest request) { + return new AutoValue_HeartbeatRequests_HeartbeatRow(work.heartbeatSender(), work, request); + } + + abstract HeartbeatSender sender(); + + abstract RefreshableWork work(); + + abstract HeartbeatRequest request(); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatSender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatSender.java new file mode 100644 
index 0000000000000..c35cd6891aadb --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatSender.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; + +/** Interface for sending heartbeats. */ +@FunctionalInterface +public interface HeartbeatSender { + /** + * Send heartbeats. Heartbeats represent WorkItem that is actively being processed belonging to + * the computation. 
+ */ + void sendHeartbeats(Heartbeat heartbeats); +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/StreamPoolHeartbeatSender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/StreamPoolHeartbeatSender.java new file mode 100644 index 0000000000000..4a1a6bc49301a --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/StreamPoolHeartbeatSender.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; + +import org.apache.beam.runners.dataflow.worker.windmill.client.CloseableStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStreamPool; +import org.apache.beam.sdk.annotations.Internal; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** StreamingEngine stream pool based implementation of {@link HeartbeatSender}. 
*/ +@Internal +public final class StreamPoolHeartbeatSender implements HeartbeatSender { + private static final Logger LOG = LoggerFactory.getLogger(StreamPoolHeartbeatSender.class); + + private final WindmillStreamPool heartbeatStreamPool; + + public StreamPoolHeartbeatSender( + WindmillStreamPool heartbeatStreamPool) { + this.heartbeatStreamPool = heartbeatStreamPool; + } + + @Override + public void sendHeartbeats(Heartbeat heartbeats) { + try (CloseableStream closeableStream = + heartbeatStreamPool.getCloseableStream()) { + closeableStream.stream().refreshActiveWork(heartbeats.heartbeatRequests()); + } catch (Exception e) { + LOG.warn("Error occurred sending heartbeats=[{}].", heartbeats, e); + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java index 7514c526bbd16..b76a16aab573d 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java @@ -89,10 +89,10 @@ public final class FakeWindmillServer extends WindmillServerStub { private final AtomicInteger expectedExceptionCount; private final ErrorCollector errorCollector; private final ConcurrentHashMap> droppedStreamingCommits; - private int commitsRequested = 0; private final List getDataRequests = new ArrayList<>(); - private boolean dropStreamingCommits = false; private final Consumer> processHeartbeatResponses; + private int commitsRequested = 0; + private boolean dropStreamingCommits = false; @GuardedBy("this") private ImmutableSet dispatcherEndpoints; @@ -230,6 +230,19 @@ public GetWorkStream getWorkStream(Windmill.GetWorkRequest request, WorkItemRece Instant startTime = Instant.now(); final 
CountDownLatch done = new CountDownLatch(1); return new GetWorkStream() { + @Override + public String backendWorkerToken() { + return ""; + } + + @Override + public void shutdown() {} + + @Override + public boolean isShutdown() { + return false; + } + @Override public void close() { done.countDown(); @@ -293,6 +306,19 @@ public Instant startTime() { public GetDataStream getDataStream() { Instant startTime = Instant.now(); return new GetDataStream() { + @Override + public String backendWorkerToken() { + return ""; + } + + @Override + public void shutdown() {} + + @Override + public boolean isShutdown() { + return false; + } + @Override public Windmill.KeyedGetDataResponse requestKeyedData( String computation, KeyedGetDataRequest request) { @@ -367,18 +393,21 @@ public CommitWorkStream commitWorkStream() { return new CommitWorkStream() { @Override - public RequestBatcher batcher() { - return new RequestBatcher() { - class RequestAndDone { - final Consumer onDone; - final WorkItemCommitRequest request; + public String backendWorkerToken() { + return ""; + } - RequestAndDone(WorkItemCommitRequest request, Consumer onDone) { - this.request = request; - this.onDone = onDone; - } - } + @Override + public void shutdown() {} + + @Override + public boolean isShutdown() { + return false; + } + @Override + public RequestBatcher batcher() { + return new RequestBatcher() { final List requests = new ArrayList<>(); @Override @@ -426,6 +455,16 @@ public void flush() { } requests.clear(); } + + class RequestAndDone { + final Consumer onDone; + final WorkItemCommitRequest request; + + RequestAndDone(WorkItemCommitRequest request, Consumer onDone) { + this.request = request; + this.onDone = onDone; + } + } }; } @@ -522,13 +561,13 @@ public ArrayList getStatsReceived() { } @Override - public synchronized void setWindmillServiceEndpoints(Set endpoints) { - this.dispatcherEndpoints = ImmutableSet.copyOf(endpoints); + public synchronized ImmutableSet getWindmillServiceEndpoints() { + 
return dispatcherEndpoints; } @Override - public synchronized ImmutableSet getWindmillServiceEndpoints() { - return dispatcherEndpoints; + public synchronized void setWindmillServiceEndpoints(Set endpoints) { + this.dispatcherEndpoints = ImmutableSet.copyOf(endpoints); } public static class ResponseQueue { diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java index 17430df611ce7..200a30537dae8 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java @@ -126,6 +126,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.Windmill.Timer.Type; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WatermarkHold; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItemCommitRequest; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.Coder.Context; import org.apache.beam.sdk.coders.CollectionCoder; @@ -332,7 +333,8 @@ private static ExecutableWork createMockWork( Work.createProcessingContext( computationId, (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}), + ignored -> {}, + mock(HeartbeatSender.class)), Instant::now, Collections.emptyList()), processWorkFn); @@ -3406,7 +3408,8 @@ public void testLatencyAttributionProtobufsPopulated() { Work.createProcessingContext( "computationId", (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}), + ignored -> {}, + mock(HeartbeatSender.class)), clock, Collections.emptyList()); @@ -3708,7 +3711,7 @@ public 
void testDoFnActiveMessageMetadataReportedOnHeartbeat() throws Exception Map result = server.waitForAndGetCommits(1); assertThat(server.numGetDataRequests(), greaterThan(0)); - Windmill.GetDataRequest heartbeat = server.getGetDataRequests().get(2); + Windmill.GetDataRequest heartbeat = server.getGetDataRequests().get(1); for (LatencyAttribution la : heartbeat diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java index 6c46bda5acfe1..72b8cff6ebd53 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java @@ -24,6 +24,7 @@ import static org.hamcrest.Matchers.equalTo; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; import com.google.api.services.dataflow.model.CounterMetadata; import com.google.api.services.dataflow.model.CounterStructuredName; @@ -62,6 +63,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateReader; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.metrics.MetricsContainer; import org.apache.beam.sdk.options.PipelineOptionsFactory; @@ -82,7 +84,6 @@ import org.junit.runner.RunWith; import org.junit.runners.JUnit4; import org.mockito.Mock; -import org.mockito.Mockito; import org.mockito.MockitoAnnotations; /** Tests for {@link 
StreamingModeExecutionContext}. */ @@ -135,7 +136,8 @@ private static Work createMockWork(Windmill.WorkItem workItem, Watermarks waterm Work.createProcessingContext( COMPUTATION_ID, (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}), + ignored -> {}, + mock(HeartbeatSender.class)), Instant::now, Collections.emptyList()); } @@ -241,8 +243,8 @@ public void testSideInputReaderReconstituted() { @Test public void extractMsecCounters() { - MetricsContainer metricsContainer = Mockito.mock(MetricsContainer.class); - ProfileScope profileScope = Mockito.mock(ProfileScope.class); + MetricsContainer metricsContainer = mock(MetricsContainer.class); + ProfileScope profileScope = mock(ProfileScope.class); ExecutionState start1 = executionContext.executionStateRegistry.getState( NameContext.create("stage", "original-1", "system-1", "user-1"), diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSourcesTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSourcesTest.java index 5d8ebd53400c6..4f4d7420dc48e 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSourcesTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSourcesTest.java @@ -97,6 +97,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateReader; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.coders.BigEndianIntegerCoder; import org.apache.beam.sdk.coders.Coder; @@ -199,7 +200,8 @@ private static Work createMockWork(Windmill.WorkItem workItem, 
Watermarks waterm Work.createProcessingContext( COMPUTATION_ID, (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}), + ignored -> {}, + mock(HeartbeatSender.class)), Instant::now, Collections.emptyList()); } @@ -1000,7 +1002,8 @@ public void testFailedWorkItemsAbort() throws Exception { Work.createProcessingContext( COMPUTATION_ID, (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - gnored -> {}), + ignored -> {}, + mock(HeartbeatSender.class)), Instant::now, Collections.emptyList()); context.start( diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java index 3a3e0a34c2179..c5e24739c5d69 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java @@ -38,6 +38,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.joda.time.Instant; @@ -87,7 +88,8 @@ private static Work.ProcessingContext createWorkProcessingContext() { return Work.createProcessingContext( "computationId", (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}); + ignored -> {}, + mock(HeartbeatSender.class)); } private static WorkId workId(long 
workToken, long cacheToken) { diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationStateCacheTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationStateCacheTest.java index 3c1683ecf436a..84a7c593d1530 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationStateCacheTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationStateCacheTest.java @@ -38,6 +38,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.sdk.fn.IdGenerators; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; @@ -69,7 +70,8 @@ private static ExecutableWork createWork(ShardedKey shardedKey, long workToken, Work.createProcessingContext( "computationId", (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}), + ignored -> {}, + mock(HeartbeatSender.class)), Instant::now, Collections.emptyList()), ignored -> {}); diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutorTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutorTest.java index e08c951975fa3..35f4aad65ae74 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutorTest.java +++ 
b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutorTest.java @@ -22,6 +22,7 @@ import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertThat; import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; import java.util.Collections; import java.util.concurrent.CountDownLatch; @@ -31,6 +32,7 @@ import org.apache.beam.runners.dataflow.worker.streaming.Watermarks; import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; import org.joda.time.Instant; @@ -66,7 +68,8 @@ private static ExecutableWork createWork(Consumer executeWorkFn) { Work.createProcessingContext( "computationId", (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}), + ignored -> {}, + mock(HeartbeatSender.class)), Instant::now, Collections.emptyList()), executeWorkFn); diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPoolTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPoolTest.java index a2f5e71d04c3f..fec01fc37c3a3 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPoolTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPoolTest.java @@ -38,12 +38,12 @@ @RunWith(JUnit4.class) public class WindmillStreamPoolTest { - @Rule public transient Timeout globalTimeout = 
Timeout.seconds(600); private static final int DEFAULT_NUM_STREAMS = 10; private static final int NEW_STREAM_HOLDS = 2; private final ConcurrentHashMap< TestWindmillStream, WindmillStreamPool.StreamData> holds = new ConcurrentHashMap<>(); + @Rule public transient Timeout globalTimeout = Timeout.seconds(600); private List> streams; @Before @@ -250,5 +250,25 @@ public boolean awaitTermination(int time, TimeUnit unit) { public Instant startTime() { return startTime; } + + @Override + public String backendWorkerToken() { + return ""; + } + + @Override + public void shutdown() { + close(); + } + + @Override + public boolean isShutdown() { + return closed; + } + + @Override + public Type streamType() { + return Type.GET_DATA; + } } } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingApplianceWorkCommitterTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingApplianceWorkCommitterTest.java index 85e07c3bd797c..cfaed7ba5289b 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingApplianceWorkCommitterTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingApplianceWorkCommitterTest.java @@ -19,6 +19,7 @@ import static com.google.common.truth.Truth.assertThat; import static org.junit.Assert.assertNotNull; +import static org.mockito.Mockito.mock; import com.google.api.services.dataflow.model.MapTask; import com.google.common.truth.Correspondence; @@ -35,6 +36,7 @@ import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import 
org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.joda.time.Instant; @@ -45,7 +47,6 @@ import org.junit.rules.ErrorCollector; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; -import org.mockito.Mockito; @RunWith(JUnit4.class) public class StreamingApplianceWorkCommitterTest { @@ -67,7 +68,8 @@ private static Work createMockWork(long workToken) { (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), ignored -> { throw new UnsupportedOperationException(); - }), + }, + mock(HeartbeatSender.class)), Instant::now, Collections.emptyList()); } @@ -76,7 +78,7 @@ private static ComputationState createComputationState(String computationId) { return new ComputationState( computationId, new MapTask().setSystemName("system").setStageName("stage"), - Mockito.mock(BoundedQueueExecutor.class), + mock(BoundedQueueExecutor.class), ImmutableMap.of(), null); } @@ -90,7 +92,7 @@ private StreamingApplianceWorkCommitter createWorkCommitter( public void setUp() { fakeWindmillServer = new FakeWindmillServer( - errorCollector, ignored -> Optional.of(Mockito.mock(ComputationState.class))); + errorCollector, ignored -> Optional.of(mock(ComputationState.class))); } @After diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitterTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitterTest.java index d53690938aef7..3de55684caa85 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitterTest.java +++ 
b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitterTest.java @@ -21,6 +21,7 @@ import static org.apache.beam.runners.dataflow.worker.windmill.Windmill.CommitStatus.OK; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; import com.google.api.services.dataflow.model.MapTask; import java.io.IOException; @@ -49,6 +50,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.CloseableStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.CommitWorkStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStreamPool; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.joda.time.Duration; @@ -60,7 +62,6 @@ import org.junit.rules.ErrorCollector; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; -import org.mockito.Mockito; @RunWith(JUnit4.class) public class StreamingEngineWorkCommitterTest { @@ -84,7 +85,8 @@ private static Work createMockWork(long workToken) { (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), ignored -> { throw new UnsupportedOperationException(); - }), + }, + mock(HeartbeatSender.class)), Instant::now, Collections.emptyList()); } @@ -93,7 +95,7 @@ private static ComputationState createComputationState(String computationId) { return new ComputationState( computationId, new MapTask().setSystemName("system").setStageName("stage"), - Mockito.mock(BoundedQueueExecutor.class), + mock(BoundedQueueExecutor.class), ImmutableMap.of(), null); } @@ -110,7 +112,7 @@ private static CompleteCommit asCompleteCommit(Commit commit, Windmill.CommitSta public void setUp() throws IOException { 
fakeWindmillServer = new FakeWindmillServer( - errorCollector, ignored -> Optional.of(Mockito.mock(ComputationState.class))); + errorCollector, ignored -> Optional.of(mock(ComputationState.class))); commitWorkStreamFactory = WindmillStreamPool.create( 1, Duration.standardMinutes(1), fakeWindmillServer::commitWorkStream) @@ -284,6 +286,19 @@ public boolean awaitTermination(int time, TimeUnit unit) { public Instant startTime() { return Instant.now(); } + + @Override + public String backendWorkerToken() { + return ""; + } + + @Override + public void shutdown() {} + + @Override + public boolean isShutdown() { + return false; + } }; commitWorkStreamFactory = diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSenderTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSenderTest.java index 162c69509ae15..a515ff4161a47 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSenderTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSenderTest.java @@ -27,9 +27,9 @@ import static org.mockito.Mockito.when; import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc; -import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc.CloudWindmillServiceV1Alpha1Stub; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetWorkRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.JobHeader; +import org.apache.beam.runners.dataflow.worker.windmill.WindmillConnection; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.CommitWorkStream; import 
org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; @@ -66,7 +66,7 @@ public class WindmillStreamSenderTest { (workItem, watermarks, processingContext, ackWorkItemQueued, getWorkStreamLatencies) -> {}; @Rule public transient Timeout globalTimeout = Timeout.seconds(600); private ManagedChannel inProcessChannel; - private CloudWindmillServiceV1Alpha1Stub stub; + private WindmillConnection connection; @Before public void setUp() { @@ -74,7 +74,10 @@ public void setUp() { grpcCleanup.register( InProcessChannelBuilder.forName("WindmillStreamSenderTest").directExecutor().build()); grpcCleanup.register(inProcessChannel); - stub = CloudWindmillServiceV1Alpha1Grpc.newStub(inProcessChannel); + connection = + WindmillConnection.builder() + .setStub(CloudWindmillServiceV1Alpha1Grpc.newStub(inProcessChannel)) + .build(); } @After @@ -95,7 +98,7 @@ public void testStartStream_startsAllStreams() { verify(streamFactory) .createDirectGetWorkStream( - eq(stub), + eq(connection), eq( GET_WORK_REQUEST .toBuilder() @@ -107,8 +110,8 @@ public void testStartStream_startsAllStreams() { any(), eq(workItemScheduler)); - verify(streamFactory).createGetDataStream(eq(stub), any(ThrottleTimer.class)); - verify(streamFactory).createCommitWorkStream(eq(stub), any(ThrottleTimer.class)); + verify(streamFactory).createGetDataStream(eq(connection.stub()), any(ThrottleTimer.class)); + verify(streamFactory).createCommitWorkStream(eq(connection.stub()), any(ThrottleTimer.class)); } @Test @@ -126,7 +129,7 @@ public void testStartStream_onlyStartsStreamsOnce() { verify(streamFactory, times(1)) .createDirectGetWorkStream( - eq(stub), + eq(connection), eq( GET_WORK_REQUEST .toBuilder() @@ -138,8 +141,10 @@ public void testStartStream_onlyStartsStreamsOnce() { any(), eq(workItemScheduler)); - verify(streamFactory, times(1)).createGetDataStream(eq(stub), any(ThrottleTimer.class)); - 
verify(streamFactory, times(1)).createCommitWorkStream(eq(stub), any(ThrottleTimer.class)); + verify(streamFactory, times(1)) + .createGetDataStream(eq(connection.stub()), any(ThrottleTimer.class)); + verify(streamFactory, times(1)) + .createCommitWorkStream(eq(connection.stub()), any(ThrottleTimer.class)); } @Test @@ -160,7 +165,7 @@ public void testStartStream_onlyStartsStreamsOnceConcurrent() throws Interrupted verify(streamFactory, times(1)) .createDirectGetWorkStream( - eq(stub), + eq(connection), eq( GET_WORK_REQUEST .toBuilder() @@ -172,8 +177,10 @@ public void testStartStream_onlyStartsStreamsOnceConcurrent() throws Interrupted any(), eq(workItemScheduler)); - verify(streamFactory, times(1)).createGetDataStream(eq(stub), any(ThrottleTimer.class)); - verify(streamFactory, times(1)).createCommitWorkStream(eq(stub), any(ThrottleTimer.class)); + verify(streamFactory, times(1)) + .createGetDataStream(eq(connection.stub()), any(ThrottleTimer.class)); + verify(streamFactory, times(1)) + .createCommitWorkStream(eq(connection.stub()), any(ThrottleTimer.class)); } @Test @@ -198,7 +205,7 @@ public void testCloseAllStreams_closesAllStreams() { CommitWorkStream mockCommitWorkStream = mock(CommitWorkStream.class); when(mockStreamFactory.createDirectGetWorkStream( - eq(stub), + eq(connection), eq(getWorkRequestWithBudget), any(ThrottleTimer.class), any(), @@ -206,9 +213,9 @@ public void testCloseAllStreams_closesAllStreams() { eq(workItemScheduler))) .thenReturn(mockGetWorkStream); - when(mockStreamFactory.createGetDataStream(eq(stub), any(ThrottleTimer.class))) + when(mockStreamFactory.createGetDataStream(eq(connection.stub()), any(ThrottleTimer.class))) .thenReturn(mockGetDataStream); - when(mockStreamFactory.createCommitWorkStream(eq(stub), any(ThrottleTimer.class))) + when(mockStreamFactory.createCommitWorkStream(eq(connection.stub()), any(ThrottleTimer.class))) .thenReturn(mockCommitWorkStream); WindmillStreamSender windmillStreamSender = @@ -231,7 +238,7 @@ private 
WindmillStreamSender newWindmillStreamSender(GetWorkBudget budget) { private WindmillStreamSender newWindmillStreamSender( GetWorkBudget budget, GrpcWindmillStreamFactory streamFactory) { return WindmillStreamSender.create( - stub, + connection, GET_WORK_REQUEST, budget, streamFactory, diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributorTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributorTest.java index 83ae8aa22ce3e..68f4559bfd3f0 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributorTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributorTest.java @@ -31,6 +31,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.JobHeader; +import org.apache.beam.runners.dataflow.worker.windmill.WindmillConnection; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcWindmillStreamFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.WindmillStreamSender; @@ -244,7 +245,7 @@ public void testDistributeBudget_distributesFairlyWhenNotEven() { private WindmillStreamSender createWindmillStreamSender(GetWorkBudget getWorkBudget) { return WindmillStreamSender.create( - stub, + WindmillConnection.builder().setStub(stub).build(), Windmill.GetWorkRequest.newBuilder() .setClientId(1L) .setJobId("job") diff --git 
a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessorTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessorTest.java index bd55595da1355..079c6b4640449 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessorTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessorTest.java @@ -18,6 +18,7 @@ package org.apache.beam.runners.dataflow.worker.windmill.work.processing.failures; import static com.google.common.truth.Truth.assertThat; +import static org.mockito.Mockito.mock; import java.util.ArrayList; import java.util.HashSet; @@ -34,6 +35,7 @@ import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; import org.joda.time.Duration; @@ -87,7 +89,8 @@ private static ExecutableWork createWork(Supplier clock, Consumer Work.createProcessingContext( "computationId", (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}), + ignored -> {}, + mock(HeartbeatSender.class)), clock, new ArrayList<>()), processWorkFn); diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/DispatchedActiveWorkRefresherTest.java 
b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresherTest.java similarity index 80% rename from runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/DispatchedActiveWorkRefresherTest.java rename to runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresherTest.java index 13019116767c2..845687b457b08 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/DispatchedActiveWorkRefresherTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresherTest.java @@ -38,6 +38,7 @@ import java.util.concurrent.TimeUnit; import java.util.function.Consumer; import java.util.function.Supplier; +import java.util.stream.Collectors; import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; import org.apache.beam.runners.dataflow.worker.streaming.ComputationState; import org.apache.beam.runners.dataflow.worker.streaming.ExecutableWork; @@ -46,12 +47,12 @@ import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.direct.Clock; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.HashBasedTable; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; +import 
org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Table; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; import org.joda.time.Duration; @@ -61,11 +62,11 @@ import org.junit.runners.JUnit4; @RunWith(JUnit4.class) -public class DispatchedActiveWorkRefresherTest { - +public class ActiveWorkRefresherTest { private static final Supplier A_LONG_TIME_AGO = () -> Instant.parse("1998-09-04T00:00:00Z"); private static final String COMPUTATION_ID_PREFIX = "ComputationId-"; + private final HeartbeatSender heartbeatSender = mock(HeartbeatSender.class); private static BoundedQueueExecutor workExecutor() { return new BoundedQueueExecutor( @@ -97,15 +98,20 @@ private ActiveWorkRefresher createActiveWorkRefresher( int activeWorkRefreshPeriodMillis, int stuckCommitDurationMillis, Supplier> computations, - Consumer>> activeWorkRefresherFn) { - return new DispatchedActiveWorkRefresher( + Consumer> activeWorkRefresherFn) { + return new ActiveWorkRefresher( clock, activeWorkRefreshPeriodMillis, stuckCommitDurationMillis, computations, DataflowExecutionStateSampler.instance(), - activeWorkRefresherFn, - Executors.newSingleThreadScheduledExecutor()); + Executors.newSingleThreadScheduledExecutor(), + activeWorkRefresherFn); + } + + private ExecutableWork createOldWork(int workIds, Consumer processWork) { + ShardedKey shardedKey = ShardedKey.create(ByteString.EMPTY, workIds); + return createOldWork(shardedKey, workIds, processWork); } private ExecutableWork createOldWork( @@ -122,8 +128,9 @@ private ExecutableWork createOldWork( Work.createProcessingContext( "computationId", (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}), - DispatchedActiveWorkRefresherTest.A_LONG_TIME_AGO, + ignored -> {}, + heartbeatSender), + A_LONG_TIME_AGO, ImmutableList.of()), processWork); } @@ -147,8 +154,7 @@ public void 
testActiveWorkRefresh() throws InterruptedException { Map> computationsAndWork = new HashMap<>(); for (int i = 0; i < 5; i++) { ComputationState computationState = createComputationState(i); - ExecutableWork fakeWork = - createOldWork(ShardedKey.create(ByteString.EMPTY, i), i, processWork); + ExecutableWork fakeWork = createOldWork(i, processWork); computationState.activateWork(fakeWork); computations.add(computationState); @@ -158,10 +164,9 @@ public void testActiveWorkRefresh() throws InterruptedException { activeWorkForComputation.add(fakeWork); } - Map> expectedHeartbeats = new HashMap<>(); + Map fanoutExpectedHeartbeats = new HashMap<>(); CountDownLatch heartbeatsSent = new CountDownLatch(1); TestClock fakeClock = new TestClock(Instant.now()); - ActiveWorkRefresher activeWorkRefresher = createActiveWorkRefresher( fakeClock::now, @@ -169,7 +174,7 @@ public void testActiveWorkRefresh() throws InterruptedException { 0, () -> computations, heartbeats -> { - expectedHeartbeats.putAll(heartbeats); + fanoutExpectedHeartbeats.putAll(heartbeats); heartbeatsSent.countDown(); }); @@ -178,23 +183,30 @@ public void testActiveWorkRefresh() throws InterruptedException { heartbeatsSent.await(); activeWorkRefresher.stop(); - assertThat(computationsAndWork.size()).isEqualTo(expectedHeartbeats.size()); - for (Map.Entry> expectedHeartbeat : - expectedHeartbeats.entrySet()) { - String computationId = expectedHeartbeat.getKey(); - List heartbeatRequests = expectedHeartbeat.getValue(); - List work = computationsAndWork.get(computationId); - - // Compare the heartbeatRequest's and Work's workTokens, cacheTokens, and shardingKeys. 
- assertThat(heartbeatRequests) - .comparingElementsUsing( - Correspondence.from( - (HeartbeatRequest h, ExecutableWork w) -> - h.getWorkToken() == w.getWorkItem().getWorkToken() - && h.getCacheToken() == w.getWorkItem().getWorkToken() - && h.getShardingKey() == w.getWorkItem().getShardingKey(), - "heartbeatRequest's and Work's workTokens, cacheTokens, and shardingKeys should be equal.")) - .containsExactlyElementsIn(work); + assertThat(computationsAndWork.size()) + .isEqualTo( + Iterables.getOnlyElement(fanoutExpectedHeartbeats.values()).heartbeatRequests().size()); + for (Map.Entry fanOutExpectedHeartbeat : + fanoutExpectedHeartbeats.entrySet()) { + for (Map.Entry> expectedHeartbeat : + fanOutExpectedHeartbeat.getValue().heartbeatRequests().entrySet()) { + String computationId = expectedHeartbeat.getKey(); + List heartbeatRequests = expectedHeartbeat.getValue(); + List work = + computationsAndWork.get(computationId).stream() + .map(ExecutableWork::work) + .collect(Collectors.toList()); + // Compare the heartbeatRequest's and Work's workTokens, cacheTokens, and shardingKeys. 
+ assertThat(heartbeatRequests) + .comparingElementsUsing( + Correspondence.from( + (Windmill.HeartbeatRequest h, Work w) -> + h.getWorkToken() == w.getWorkItem().getWorkToken() + && h.getCacheToken() == w.getWorkItem().getWorkToken() + && h.getShardingKey() == w.getWorkItem().getShardingKey(), + "heartbeatRequest's and Work's workTokens, cacheTokens, and shardingKeys should be equal.")) + .containsExactlyElementsIn(work); + } } activeWorkRefresher.stop(); diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatRequestsTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatRequestsTest.java new file mode 100644 index 0000000000000..1f0a12383ea40 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatRequestsTest.java @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; + +import static com.google.common.truth.Truth.assertThat; +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList.toImmutableList; + +import com.google.auto.value.AutoValue; +import java.util.ArrayDeque; +import java.util.Collections; +import java.util.Deque; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; +import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; +import org.apache.beam.runners.dataflow.worker.streaming.RefreshableWork; +import org.apache.beam.runners.dataflow.worker.streaming.ShardedKey; +import org.apache.beam.runners.dataflow.worker.streaming.Watermarks; +import org.apache.beam.runners.dataflow.worker.streaming.Work; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableListMultimap; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Table; +import org.joda.time.Instant; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class HeartbeatRequestsTest { + + private Map> activeWork; + + private static Work createWork(Windmill.WorkItem workItem, HeartbeatSender heartbeatSender) { + return Work.create( + workItem, + Watermarks.builder().setInputDataWatermark(Instant.EPOCH).build(), + createProcessingContext(heartbeatSender), + Instant::now, + Collections.emptyList()); + } + + private static ShardedKey shardedKey(String str, long shardKey) { + return ShardedKey.create(ByteString.copyFromUtf8(str), shardKey); + } + + private static Work.ProcessingContext 
createProcessingContext(HeartbeatSender heartbeatSender) { + return Work.createProcessingContext( + "computationId", + (computationId, request) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), + ignored -> {}, + heartbeatSender); + } + + private static Work expiredWork(Windmill.WorkItem workItem, HeartbeatSender heartbeatSender) { + return Work.create( + workItem, + Watermarks.builder().setInputDataWatermark(Instant.EPOCH).build(), + createProcessingContext(heartbeatSender), + () -> Instant.EPOCH, + Collections.emptyList()); + } + + private static Windmill.WorkItem createWorkItem(long workToken, long cacheToken) { + return Windmill.WorkItem.newBuilder() + .setKey(ByteString.copyFromUtf8("")) + .setShardingKey(1) + .setWorkToken(workToken) + .setCacheToken(cacheToken) + .build(); + } + + @Before + public void setUp() { + activeWork = new HashMap<>(); + } + + @Test + public void testGetRefreshableFanoutKeyHeartbeats() { + Instant refreshDeadline = Instant.now(); + HeartbeatSender sender1 = ignored -> {}; + HeartbeatSender sender2 = ignored -> {}; + + Work freshWork = createWork(createWorkItem(3L, 3L), sender1); + Work refreshableWork1 = expiredWork(createWorkItem(1L, 1L), sender1); + refreshableWork1.setState(Work.State.COMMITTING); + Work refreshableWork2 = expiredWork(createWorkItem(2L, 2L), sender2); + refreshableWork2.setState(Work.State.COMMITTING); + ShardedKey shardedKey1 = shardedKey("someKey", 1L); + ShardedKey shardedKey2 = shardedKey("anotherKey", 2L); + + activateWorkForKey(shardedKey1, refreshableWork1); + activateWorkForKey(shardedKey1, freshWork); + activateWorkForKey(shardedKey2, refreshableWork2); + + Table requests = + HeartbeatRequests.getRefreshableKeyHeartbeats( + currentActiveWork(), refreshDeadline, DataflowExecutionStateSampler.instance()); + + ImmutableList expected = + ImmutableList.of( + HeartbeatRequestShardingKeyWorkTokenAndCacheToken.from(shardedKey1, refreshableWork1), + 
HeartbeatRequestShardingKeyWorkTokenAndCacheToken.from(shardedKey2, refreshableWork2)); + + ImmutableList actual = + requests.cellSet().stream() + .map( + entry -> + HeartbeatRequestShardingKeyWorkTokenAndCacheToken.from( + entry.getValue(), entry.getRowKey())) + .collect(toImmutableList()); + + assertThat(actual).containsExactlyElementsIn(expected); + } + + private void activateWorkForKey(ShardedKey shardedKey, Work work) { + Deque workQueue = activeWork.computeIfAbsent(shardedKey, ignored -> new ArrayDeque<>()); + workQueue.addLast(work); + } + + private ImmutableListMultimap currentActiveWork() { + ImmutableListMultimap.Builder currentActiveWork = + ImmutableListMultimap.builder(); + + for (Map.Entry> keyedWorkQueues : activeWork.entrySet()) { + currentActiveWork.putAll( + keyedWorkQueues.getKey(), + keyedWorkQueues.getValue().stream() + .map(Work::refreshableView) + .collect(Collectors.toList())); + } + + return currentActiveWork.build(); + } + + @AutoValue + abstract static class HeartbeatRequestShardingKeyWorkTokenAndCacheToken { + + private static HeartbeatRequestShardingKeyWorkTokenAndCacheToken create( + long shardingKey, long workToken, long cacheToken, HeartbeatSender sender) { + return new AutoValue_HeartbeatRequestsTest_HeartbeatRequestShardingKeyWorkTokenAndCacheToken( + shardingKey, workToken, cacheToken, sender); + } + + private static HeartbeatRequestShardingKeyWorkTokenAndCacheToken from( + ShardedKey shardedKey, Work work) { + return create( + shardedKey.shardingKey(), + work.getWorkItem().getWorkToken(), + work.getWorkItem().getCacheToken(), + work.heartbeatSender()); + } + + private static HeartbeatRequestShardingKeyWorkTokenAndCacheToken from( + Windmill.HeartbeatRequest heartbeatRequest, HeartbeatSender sender) { + return create( + heartbeatRequest.getShardingKey(), + heartbeatRequest.getWorkToken(), + heartbeatRequest.getCacheToken(), + sender); + } + + abstract long shardingKey(); + + abstract long workToken(); + + abstract long 
cacheToken(); + + abstract HeartbeatSender heartbeatSender(); + + @Override + public final boolean equals(Object obj) { + if (!(obj instanceof HeartbeatRequestShardingKeyWorkTokenAndCacheToken)) { + return false; + } + HeartbeatRequestShardingKeyWorkTokenAndCacheToken other = + (HeartbeatRequestShardingKeyWorkTokenAndCacheToken) obj; + return shardingKey() == other.shardingKey() + && workToken() == other.workToken() + && cacheToken() == other.cacheToken(); + } + + @Override + public final int hashCode() { + return Objects.hash(shardingKey(), workToken(), cacheToken()); + } + } +} From f6cceaf9abb4ea06cb42a133362e3d8438b7f9cb Mon Sep 17 00:00:00 2001 From: Martin Trieu Date: Fri, 12 Jul 2024 00:38:24 +0900 Subject: [PATCH 3/7] address PR comments --- .../worker/StreamingDataflowWorker.java | 24 +- .../worker/streaming/ActiveWorkState.java | 42 +--- .../worker/streaming/ComputationState.java | 5 + .../worker/streaming/RefreshableWork.java | 6 +- .../dataflow/worker/streaming/Work.java | 218 +++++------------- .../worker/windmill/WindmillConnection.java | 7 +- .../client/AbstractWindmillStream.java | 6 +- .../windmill/client/WindmillStream.java | 5 +- .../windmill/client/WindmillStreamPool.java | 4 +- .../getdata/ApplianceGetDataClient.java | 25 +- .../getdata/FanOutWorkRefreshClient.java | 13 +- .../getdata/StreamingEngineGetDataClient.java | 24 +- .../ThrottlingGetDataMetricTracker.java | 5 +- .../client/getdata/WorkRefreshClient.java | 4 +- .../client/grpc/GrpcDirectGetWorkStream.java | 4 +- .../client/grpc/GrpcGetDataStream.java | 7 +- .../grpc/GrpcWindmillStreamFactory.java | 2 +- .../client/grpc/StreamingEngineClient.java | 2 +- .../client/grpc/WindmillStreamSender.java | 6 +- .../processing/StreamingWorkScheduler.java | 3 +- .../work/refresh/ActiveWorkRefresher.java | 54 +++-- .../refresh/ApplianceHeartbeatSender.java | 8 +- ...r.java => FixedStreamHeartbeatSender.java} | 22 +- .../windmill/work/refresh/Heartbeat.java | 40 ---- 
.../work/refresh/HeartbeatRequests.java | 84 ------- .../work/refresh/HeartbeatSender.java | 2 +- .../windmill/work/refresh/Heartbeats.java | 67 ++++++ .../refresh/StreamPoolHeartbeatSender.java | 4 +- .../dataflow/worker/FakeWindmillServer.java | 13 +- .../worker/StreamingDataflowWorkerTest.java | 2 +- .../worker/streaming/ActiveWorkStateTest.java | 71 ------ .../client/WindmillStreamPoolTest.java | 4 +- .../StreamingEngineWorkCommitterTest.java | 2 +- .../grpc/GrpcGetWorkerMetadataStreamTest.java | 2 +- .../client/grpc/GrpcWindmillServerTest.java | 27 ++- .../client/grpc/WindmillStreamSenderTest.java | 6 +- .../work/refresh/ActiveWorkRefresherTest.java | 12 +- .../work/refresh/HeartbeatRequestsTest.java | 206 ----------------- 38 files changed, 288 insertions(+), 750 deletions(-) rename runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/{DirectHeartbeatSender.java => FixedStreamHeartbeatSender.java} (74%) delete mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeat.java delete mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatRequests.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeats.java delete mode 100644 runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatRequestsTest.java diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java index 9ae2248afa1d5..b2b4b0796db59 100644 --- 
a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java @@ -168,7 +168,6 @@ public class StreamingDataflowWorker { private final DataflowWorkerHarnessOptions options; private final long clientId; private final GetDataClient getDataClient; - private final WorkRefreshClient workRefreshClient; private final MemoryMonitor memoryMonitor; private final Thread memoryMonitorThread; private final ReaderCache readerCache; @@ -259,26 +258,27 @@ private StreamingDataflowWorker( GET_DATA_STREAM_TIMEOUT, windmillServer::getDataStream); + // Register standard file systems. + FileSystems.setDefaultPipelineOptions(options); + + int stuckCommitDurationMillis = + windmillServiceEnabled && options.getStuckCommitDurationMillis() > 0 + ? options.getStuckCommitDurationMillis() + : 0; + + WorkRefreshClient workRefreshClient; if (windmillServiceEnabled) { StreamingEngineGetDataClient streamingEngineGetDataClient = new StreamingEngineGetDataClient(getDataMetricTracker, getDataStreamPool); this.getDataClient = streamingEngineGetDataClient; - this.workRefreshClient = streamingEngineGetDataClient; + workRefreshClient = streamingEngineGetDataClient; } else { ApplianceGetDataClient applianceGetDataClient = new ApplianceGetDataClient(windmillServer, getDataMetricTracker); this.getDataClient = applianceGetDataClient; - this.workRefreshClient = applianceGetDataClient; + workRefreshClient = applianceGetDataClient; } - // Register standard file systems. - FileSystems.setDefaultPipelineOptions(options); - - int stuckCommitDurationMillis = - windmillServiceEnabled && options.getStuckCommitDurationMillis() > 0 - ? 
options.getStuckCommitDurationMillis() - : 0; - this.activeWorkRefresher = new ActiveWorkRefresher( clock, @@ -914,7 +914,7 @@ void streamingDispatchLoop() { // If at any point the server closes the stream, we will reconnect immediately; otherwise // we half-close the stream after some time and create a new one. if (!stream.awaitTermination(GET_WORK_STREAM_TIMEOUT_MINUTES, TimeUnit.MINUTES)) { - stream.close(); + stream.halfClose(); } } catch (InterruptedException e) { // Continue processing until !running.get() diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java index 5b7e04269f440..64309d0a75010 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java @@ -32,13 +32,9 @@ import java.util.Queue; import java.util.concurrent.atomic.AtomicReference; import java.util.function.BiConsumer; -import java.util.stream.Stream; import javax.annotation.Nullable; import javax.annotation.concurrent.GuardedBy; import javax.annotation.concurrent.ThreadSafe; -import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItem; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; @@ -108,29 +104,6 @@ private static String elapsedString(Instant start, Instant end) { return activeFor.toString().substring(2); } - private 
static Stream toHeartbeatRequestStream( - Entry> shardedKeyAndWorkQueue, - Instant refreshDeadline, - DataflowExecutionStateSampler sampler) { - ShardedKey shardedKey = shardedKeyAndWorkQueue.getKey(); - Deque workQueue = shardedKeyAndWorkQueue.getValue(); - - return workQueue.stream() - .map(ExecutableWork::work) - .filter(work -> work.getStartTime().isBefore(refreshDeadline)) - // Don't send heartbeats for queued work we already know is failed. - .filter(work -> !work.isFailed()) - .map( - work -> - Windmill.HeartbeatRequest.newBuilder() - .setShardingKey(shardedKey.shardingKey()) - .setWorkToken(work.getWorkItem().getWorkToken()) - .setCacheToken(work.getWorkItem().getCacheToken()) - .addAllLatencyAttribution( - work.getLatencyAttributions(/* isHeartbeat= */ true, sampler)) - .build()); - } - /** * Activates {@link Work} for the {@link ShardedKey}. Outcome can be 1 of 4 {@link * ActivateWorkResult} @@ -236,6 +209,14 @@ synchronized ImmutableListMultimap getReadOnlyActiv e -> e.getValue().stream().map(ExecutableWork::work).map(Work::refreshableView))); } + synchronized ImmutableList getRefreshableWork(Instant refreshDeadline) { + return activeWork.values().stream() + .flatMap(Deque::stream) + .map(ExecutableWork::work) + .filter(work -> work.isRefreshable(refreshDeadline)) + .collect(toImmutableList()); + } + private void incrementActiveWorkBudget(Work work) { activeGetWorkBudget.updateAndGet( getWorkBudget -> getWorkBudget.apply(1, work.getWorkItem().getSerializedSize())); @@ -341,13 +322,6 @@ private synchronized ImmutableMap getStuckCommitsAt( return stuckCommits.build(); } - synchronized ImmutableList getKeyHeartbeats( - Instant refreshDeadline, DataflowExecutionStateSampler sampler) { - return activeWork.entrySet().stream() - .flatMap(entry -> toHeartbeatRequestStream(entry, refreshDeadline, sampler)) - .collect(toImmutableList()); - } - /** * Returns the current aggregate {@link GetWorkBudget} that is active on the user worker. 
Active * means that the work is received from Windmill, being processed or queued to be processed in diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationState.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationState.java index 789dac7dfcf1e..f3b0ba16fbe2d 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationState.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationState.java @@ -28,6 +28,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableListMultimap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Multimap; @@ -149,6 +150,10 @@ public ImmutableListMultimap currentActiveWorkReadO return activeWorkState.getReadOnlyActiveWork(); } + public ImmutableList getRefreshableWork(Instant refreshDeadline) { + return activeWorkState.getRefreshableWork(refreshDeadline); + } + public GetWorkBudget getActiveWorkBudget() { return activeWorkState.currentActiveWorkBudget(); } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/RefreshableWork.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/RefreshableWork.java index ebbf1911ef4a4..a1668d9ae7851 100644 --- 
a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/RefreshableWork.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/RefreshableWork.java @@ -30,12 +30,14 @@ public interface RefreshableWork { WorkId id(); + ShardedKey getShardedKey(); + boolean isRefreshable(Instant refreshDeadline); HeartbeatSender heartbeatSender(); - ImmutableList getLatencyAttributions( - boolean isHeartbeat, DataflowExecutionStateSampler sampler); + ImmutableList getHeartbeatLatencyAttributions( + DataflowExecutionStateSampler sampler); void setFailed(); } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java index 1a0012f040eff..6a58ef93069fe 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java @@ -20,7 +20,6 @@ import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList.toImmutableList; import com.google.auto.value.AutoValue; -import com.google.common.base.Objects; import java.util.Arrays; import java.util.Collection; import java.util.EnumMap; @@ -32,7 +31,6 @@ import java.util.function.Consumer; import java.util.function.Function; import java.util.function.Supplier; -import javax.annotation.Nullable; import javax.annotation.concurrent.NotThreadSafe; import org.apache.beam.repackaged.core.org.apache.commons.lang3.tuple.Pair; import org.apache.beam.runners.dataflow.worker.ActiveMessageMetadata; @@ -53,8 +51,6 @@ import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.joda.time.Duration; import 
org.joda.time.Instant; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** * Represents the state of an attempt to process a {@link WorkItem} by executing user code. @@ -64,8 +60,6 @@ @NotThreadSafe @Internal public final class Work implements RefreshableWork { - private static final Logger LOG = LoggerFactory.getLogger(Work.class); - private final ShardedKey shardedKey; private final WorkItem workItem; private final ProcessingContext processingContext; @@ -75,35 +69,28 @@ public final class Work implements RefreshableWork { private final Map totalDurationPerState; private final WorkId id; private final String latencyTrackingId; - private final Runnable onFailed; private TimedState currentState; private volatile boolean isFailed; private Work( - ShardedKey shardedKey, WorkItem workItem, - ProcessingContext processingContext, Watermarks watermarks, - Supplier clock, - Instant startTime, - Map totalDurationPerState, - WorkId id, - String latencyTrackingId, - Runnable onFailed, - TimedState currentState, - boolean isFailed) { - this.shardedKey = shardedKey; + ProcessingContext processingContext, + Supplier clock) { + this.shardedKey = ShardedKey.create(workItem.getKey(), workItem.getShardingKey()); this.workItem = workItem; + this.processingContext = processingContext; this.watermarks = watermarks; this.clock = clock; - this.startTime = startTime; - this.totalDurationPerState = totalDurationPerState; - this.id = id; - this.latencyTrackingId = latencyTrackingId; - this.onFailed = onFailed; - this.currentState = currentState; - this.isFailed = isFailed; - this.processingContext = processingContext; + this.startTime = clock.get(); + this.totalDurationPerState = new EnumMap<>(LatencyAttribution.State.class); + this.id = WorkId.of(workItem); + this.latencyTrackingId = + Long.toHexString(workItem.getShardingKey()) + + '-' + + Long.toHexString(workItem.getWorkToken()); + this.currentState = TimedState.initialState(startTime); + this.isFailed = false; } public 
static Work create( @@ -112,21 +99,7 @@ public static Work create( ProcessingContext processingContext, Supplier clock, Collection getWorkStreamLatencies) { - Instant startTime = clock.get(); - Work work = - new Work( - ShardedKey.create(workItem.getKey(), workItem.getShardingKey()), - workItem, - processingContext, - watermarks, - clock, - startTime, - new EnumMap<>(LatencyAttribution.State.class), - WorkId.of(workItem), - buildLatencyTrackingId(workItem), - () -> {}, - TimedState.initialState(startTime), - false); + Work work = new Work(workItem, watermarks, processingContext, clock); work.recordGetWorkStreamLatencies(getWorkStreamLatencies); return work; } @@ -136,8 +109,7 @@ public static ProcessingContext createProcessingContext( BiFunction getKeyedDataFn, Consumer workCommitter, HeartbeatSender heartbeatSender) { - return ProcessingContext.create(computationId, getKeyedDataFn, workCommitter, heartbeatSender) - .build(); + return ProcessingContext.create(computationId, getKeyedDataFn, workCommitter, heartbeatSender); } private static LatencyAttribution.Builder createLatencyAttributionWithActiveLatencyBreakdown( @@ -177,33 +149,15 @@ private static LatencyAttribution.Builder createLatencyAttributionWithActiveLate return latencyAttribution; } - private static String buildLatencyTrackingId(WorkItem workItem) { - return Long.toHexString(workItem.getShardingKey()) - + '-' - + Long.toHexString(workItem.getWorkToken()); - } - - /** Returns a new {@link Work} instance with the same state and a different failure handler. 
*/ - public Work withFailureHandler(Runnable onFailed) { - return new Work( - shardedKey, - workItem, - processingContext, - watermarks, - clock, - startTime, - totalDurationPerState, - id, - latencyTrackingId, - onFailed, - currentState, - isFailed); + public RefreshableWork refreshableView() { + return this; } public WorkItem getWorkItem() { return workItem; } + @Override public ShardedKey getShardedKey() { return shardedKey; } @@ -233,28 +187,9 @@ public void setState(State state) { this.currentState = TimedState.create(state, now); } - @Override - public boolean isRefreshable(Instant refreshDeadline) { - return getStartTime().isBefore(refreshDeadline) && !isFailed; - } - - @Override - public HeartbeatSender heartbeatSender() { - return processingContext.heartbeatSender(); - } - @Override public void setFailed() { - LOG.debug( - "Failing work: [computationId= " - + processingContext.computationId() - + ", key=" - + shardedKey - + ", workId=" - + id - + "]. The work will be retried and is not lost."); this.isFailed = true; - onFailed.run(); } public boolean isCommitPending() { @@ -269,6 +204,16 @@ public String getLatencyTrackingId() { return latencyTrackingId; } + @Override + public boolean isRefreshable(Instant refreshDeadline) { + return !isFailed && getStartTime().isBefore(refreshDeadline); + } + + @Override + public HeartbeatSender heartbeatSender() { + return processingContext.heartbeatSender(); + } + public void queueCommit(WorkItemCommitRequest commitRequest, ComputationState computationState) { setState(State.COMMIT_QUEUED); processingContext.workCommitter().accept(Commit.create(commitRequest, computationState, this)); @@ -291,7 +236,24 @@ private void recordGetWorkStreamLatencies(Collection getWork } @Override + public ImmutableList getHeartbeatLatencyAttributions( + DataflowExecutionStateSampler sampler) { + return getLatencyAttributions(/* isHeartbeat= */ true, sampler); + } + public ImmutableList getLatencyAttributions( + 
DataflowExecutionStateSampler sampler) { + return getLatencyAttributions(/* isHeartbeat= */ false, sampler); + } + + private Duration getTotalDurationAtLatencyAttributionState(LatencyAttribution.State state) { + Duration duration = totalDurationPerState.getOrDefault(state, Duration.ZERO); + return state == this.currentState.state().toLatencyAttributionState() + ? duration.plus(new Duration(this.currentState.startTime(), clock.get())) + : duration; + } + + private ImmutableList getLatencyAttributions( boolean isHeartbeat, DataflowExecutionStateSampler sampler) { return Arrays.stream(LatencyAttribution.State.values()) .map(state -> Pair.of(state, getTotalDurationAtLatencyAttributionState(state))) @@ -308,13 +270,6 @@ public ImmutableList getLatencyAttributions( .collect(toImmutableList()); } - private Duration getTotalDurationAtLatencyAttributionState(LatencyAttribution.State state) { - Duration duration = totalDurationPerState.getOrDefault(state, Duration.ZERO); - return state == this.currentState.state().toLatencyAttributionState() - ? duration.plus(new Duration(this.currentState.startTime(), clock.get())) - : duration; - } - private LatencyAttribution createLatencyAttribution( LatencyAttribution.State state, boolean isHeartbeat, @@ -335,55 +290,11 @@ public boolean isFailed() { return isFailed; } - public String backendWorkerToken() { - return processingContext.backendWorkerToken(); - } - boolean isStuckCommittingAt(Instant stuckCommitDeadline) { return currentState.state() == Work.State.COMMITTING && currentState.startTime().isBefore(stuckCommitDeadline); } - /** Returns a view of this {@link Work} instance for work refreshing. 
*/ - public RefreshableWork refreshableView() { - return this; - } - - @Override - public boolean equals(@Nullable Object o) { - if (o == null) return false; - if (this == o) return true; - if (!(o instanceof Work)) return false; - Work work = (Work) o; - return isFailed == work.isFailed - && Objects.equal(shardedKey, work.shardedKey) - && Objects.equal(workItem, work.workItem) - && Objects.equal(processingContext, work.processingContext) - && Objects.equal(watermarks, work.watermarks) - && Objects.equal(clock, work.clock) - && Objects.equal(startTime, work.startTime) - && Objects.equal(totalDurationPerState, work.totalDurationPerState) - && Objects.equal(id, work.id) - && Objects.equal(latencyTrackingId, work.latencyTrackingId) - && Objects.equal(currentState, work.currentState); - } - - @Override - public int hashCode() { - return Objects.hashCode( - shardedKey, - workItem, - processingContext, - watermarks, - clock, - startTime, - totalDurationPerState, - id, - latencyTrackingId, - currentState, - isFailed); - } - public enum State { QUEUED(LatencyAttribution.State.QUEUED), PROCESSING(LatencyAttribution.State.ACTIVE), @@ -430,24 +341,19 @@ private boolean isCommitPending() { @AutoValue public abstract static class ProcessingContext { - private static final String UNKNOWN_BACKEND_WORKER_TOKEN = ""; - private static ProcessingContext.Builder create( + private static ProcessingContext create( String computationId, BiFunction getKeyedDataFn, Consumer workCommitter, HeartbeatSender heartbeatSender) { - return new AutoValue_Work_ProcessingContext.Builder() - .setBackendWorkerToken(UNKNOWN_BACKEND_WORKER_TOKEN) - .setComputationId(computationId) - .setHeartbeatSender(heartbeatSender) - .setWorkCommitter(workCommitter) - .setKeyedDataFetcher( - request -> Optional.ofNullable(getKeyedDataFn.apply(computationId, request))); + return new AutoValue_Work_ProcessingContext( + computationId, + request -> Optional.ofNullable(getKeyedDataFn.apply(computationId, request)), + 
heartbeatSender, + workCommitter); } - abstract String backendWorkerToken(); - /** Computation that the {@link Work} belongs to. */ public abstract String computationId(); @@ -455,30 +361,12 @@ private static ProcessingContext.Builder create( public abstract Function> keyedDataFetcher(); + public abstract HeartbeatSender heartbeatSender(); + /** * {@link WorkCommitter} that commits completed work to the backend Windmill worker handling the * {@link WorkItem}. */ public abstract Consumer workCommitter(); - - public abstract HeartbeatSender heartbeatSender(); - - public abstract Builder toBuilder(); - - @AutoValue.Builder - public abstract static class Builder { - public abstract Builder setBackendWorkerToken(String value); - - abstract Builder setComputationId(String value); - - abstract Builder setKeyedDataFetcher( - Function> value); - - abstract Builder setWorkCommitter(Consumer value); - - abstract Builder setHeartbeatSender(HeartbeatSender value); - - public abstract ProcessingContext build(); - } } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillConnection.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillConnection.java index 37afe4bb1cf6b..7d199afc0861b 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillConnection.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillConnection.java @@ -27,6 +27,8 @@ @AutoValue @Internal public abstract class WindmillConnection { + private static final String NO_BACKEND_WORKER_TOKEN = ""; + public static WindmillConnection from( Endpoint windmillEndpoint, Function endpointToStubFn) { @@ -40,10 +42,11 @@ public static WindmillConnection from( } public static Builder builder() { - return new AutoValue_WindmillConnection.Builder(); + 
return new AutoValue_WindmillConnection.Builder() + .setBackendWorkerToken(NO_BACKEND_WORKER_TOKEN); } - public abstract Optional backendWorkerToken(); + public abstract String backendWorkerToken(); public abstract Optional directEndpoint(); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java index f41514fd4d745..b00c4c9c0c7fe 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java @@ -186,7 +186,7 @@ protected final void startStream() { requestObserver = requestObserverSupplier.get(); onNewStream(); if (clientClosed.get()) { - close(); + halfClose(); } return; } @@ -249,7 +249,7 @@ public final void appendSummaryHtml(PrintWriter writer) { protected abstract void appendSpecificHtml(PrintWriter writer); @Override - public final synchronized void close() { + public final synchronized void halfClose() { // Synchronization of close and onCompleted necessary for correct retry logic in onNewStream. 
clientClosed.set(true); requestObserver().onCompleted(); @@ -274,7 +274,7 @@ public String backendWorkerToken() { @Override public void shutdown() { if (isShutdown.compareAndSet(false, true)) { - close(); + halfClose(); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java index 5e79fe0a484e5..a4bfa69ad7798 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java @@ -18,6 +18,7 @@ package org.apache.beam.runners.dataflow.worker.windmill.client; import java.io.Closeable; +import java.util.Collection; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; @@ -37,7 +38,7 @@ public interface WindmillStream { String backendWorkerToken(); /** Indicates that no more requests will be sent. */ - void close(); + void halfClose(); /** Waits for the server to close its end of the connection, with timeout. */ boolean awaitTermination(int time, TimeUnit unit) throws InterruptedException; @@ -89,7 +90,7 @@ Windmill.KeyedGetDataResponse requestKeyedData( Windmill.GlobalData requestGlobalData(Windmill.GlobalDataRequest request); /** Tells windmill processing is ongoing for the given keys. 
*/ - void refreshActiveWork(Map> heartbeats); + void refreshActiveWork(Map> heartbeats); void onHeartbeatResponse(List responses); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPool.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPool.java index 0e4e085c066c4..f14fc40fdfdf9 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPool.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPool.java @@ -128,7 +128,7 @@ public StreamT getStream() { return resultStream; } finally { if (closeThisStream != null) { - closeThisStream.close(); + closeThisStream.halfClose(); } } } @@ -166,7 +166,7 @@ public void releaseStream(StreamT stream) { } if (closeStream) { - stream.close(); + stream.halfClose(); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ApplianceGetDataClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ApplianceGetDataClient.java index 66c5100810de9..dc5adb4e7966c 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ApplianceGetDataClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ApplianceGetDataClient.java @@ -29,10 +29,11 @@ import org.apache.beam.runners.dataflow.worker.windmill.ApplianceWindmillClient; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.ComputationGetDataRequest; -import 
org.apache.beam.runners.dataflow.worker.windmill.work.refresh.Heartbeat; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.Heartbeats; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.SettableFuture; import org.checkerframework.checker.nullness.qual.Nullable; @@ -106,20 +107,20 @@ public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) * translate the HeartbeatRequest to a KeyedGetDataRequest. */ @Override - public void refreshActiveWork(Map heartbeats) { - if (heartbeats.isEmpty()) { + public void refreshActiveWork(Map heartbeats) { + Map.Entry heartbeat = + Iterables.getOnlyElement(heartbeats.entrySet()); + HeartbeatSender heartbeatSender = heartbeat.getKey(); + Heartbeats heartbeatToSend = heartbeat.getValue(); + + if (heartbeatToSend.heartbeatRequests().isEmpty()) { return; } - for (Map.Entry heartbeatToSend : heartbeats.entrySet()) { - HeartbeatSender heartbeatSender = heartbeatToSend.getKey(); - try (AutoCloseable ignored = - getDataMetricTracker.trackHeartbeats( - heartbeatToSend.getValue().heartbeatRequests().size())) { - heartbeatSender.sendHeartbeats(heartbeatToSend.getValue()); - } catch (Exception e) { - throw new GetDataException("Error occurred refreshing heartbeats=" + heartbeatToSend, e); - } + try (AutoCloseable ignored = getDataMetricTracker.trackHeartbeats(heartbeatToSend.size())) { + heartbeatSender.sendHeartbeats(heartbeatToSend); + } catch (Exception e) { + throw new GetDataException("Error occurred refreshing heartbeats=" + heartbeatToSend, e); } } diff --git 
a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FanOutWorkRefreshClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FanOutWorkRefreshClient.java index d38ae3120dbcf..426fb7b02f7ec 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FanOutWorkRefreshClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FanOutWorkRefreshClient.java @@ -23,8 +23,8 @@ import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; -import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.Heartbeat; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.Heartbeats; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; @@ -48,9 +48,9 @@ public FanOutWorkRefreshClient(ThrottlingGetDataMetricTracker getDataMetricTrack } @Override - public void refreshActiveWork(Map heartbeats) { + public void refreshActiveWork(Map heartbeats) { List> fanOutRefreshActiveWork = new ArrayList<>(); - for (Map.Entry heartbeat : heartbeats.entrySet()) { + for (Map.Entry heartbeat : heartbeats.entrySet()) { fanOutRefreshActiveWork.add(sendHeartbeatOnStreamFuture(heartbeat)); } @@ -62,14 +62,13 @@ public void refreshActiveWork(Map heartbeats) { } private CompletableFuture sendHeartbeatOnStreamFuture( - Map.Entry heartbeat) { + Map.Entry heartbeat) { return CompletableFuture.runAsync( () -> { try (AutoCloseable ignored = - getDataMetricTracker.trackHeartbeats( - heartbeat.getValue().heartbeatRequests().size())) { + 
getDataMetricTracker.trackHeartbeats(heartbeat.getValue().size())) { HeartbeatSender sender = heartbeat.getKey(); - Heartbeat heartbeats = heartbeat.getValue(); + Heartbeats heartbeats = heartbeat.getValue(); sender.sendHeartbeats(heartbeats); } catch (Exception e) { throw new GetDataClient.GetDataException("Error refreshing heartbeats.", e); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamingEngineGetDataClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamingEngineGetDataClient.java index fe2758682f3c5..54967f039f2d2 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamingEngineGetDataClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamingEngineGetDataClient.java @@ -26,9 +26,10 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.CloseableStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStreamPool; -import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.Heartbeat; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.Heartbeats; import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; /** * StreamingEngine implementation of {@link GetDataClient}. 
@@ -82,19 +83,20 @@ public Windmill.GlobalData getSideInputData(GlobalDataRequest request) { } @Override - public void refreshActiveWork(Map heartbeats) { - if (heartbeats.isEmpty()) { + public void refreshActiveWork(Map heartbeats) { + Map.Entry heartbeat = + Iterables.getOnlyElement(heartbeats.entrySet()); + HeartbeatSender heartbeatSender = heartbeat.getKey(); + Heartbeats heartbeatToSend = heartbeat.getValue(); + + if (heartbeatToSend.heartbeatRequests().isEmpty()) { return; } - for (Map.Entry heartbeatToSend : heartbeats.entrySet()) { - try (AutoCloseable ignored = - getDataMetricTracker.trackHeartbeats( - heartbeatToSend.getValue().heartbeatRequests().size())) { - heartbeatToSend.getKey().sendHeartbeats(heartbeatToSend.getValue()); - } catch (Exception e) { - throw new GetDataException("Error occurred refreshing heartbeats=" + heartbeatToSend, e); - } + try (AutoCloseable ignored = getDataMetricTracker.trackHeartbeats(heartbeatToSend.size())) { + heartbeatSender.sendHeartbeats(heartbeatToSend); + } catch (Exception e) { + throw new GetDataException("Error occurred refreshing heartbeats=" + heartbeatToSend, e); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTracker.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTracker.java index bc462df94bf7d..d356f205817a4 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTracker.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTracker.java @@ -59,11 +59,8 @@ public AutoCloseable trackHeartbeats(int numHeartbeats) { getDataMetrics .activeHeartbeats() .getAndUpdate(currentActiveHeartbeats -> 
currentActiveHeartbeats + numHeartbeats); - // Active heartbeats should never drop below 0. return () -> - getDataMetrics - .activeHeartbeats() - .getAndUpdate(existing -> Math.max(existing - numHeartbeats, 0)); + getDataMetrics.activeHeartbeats().getAndUpdate(existing -> existing - numHeartbeats); } public void printHtml(PrintWriter writer) { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/WorkRefreshClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/WorkRefreshClient.java index a5bcba9cf3da3..76f6147b07434 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/WorkRefreshClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/WorkRefreshClient.java @@ -18,10 +18,10 @@ package org.apache.beam.runners.dataflow.worker.windmill.client.getdata; import java.util.Map; -import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.Heartbeat; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.Heartbeats; /** Client for requesting work refresh via heartbeats. 
*/ public interface WorkRefreshClient { - void refreshActiveWork(Map heartbeats); + void refreshActiveWork(Map heartbeats); } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java index 20fbf2fb7619e..19401866d13cf 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java @@ -43,7 +43,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.ThrottleTimer; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemScheduler; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; -import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.DirectHeartbeatSender; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.FixedStreamHeartbeatSender; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.sdk.util.BackOff; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; @@ -339,7 +339,7 @@ private Work.ProcessingContext createProcessingContext(String computationId) { computationId, getDataStream.get()::requestKeyedData, workCommitter.get()::commit, - DirectHeartbeatSender.create(getDataStream.get())); + FixedStreamHeartbeatSender.create(getDataStream.get())); } } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetDataStream.java 
b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetDataStream.java index 9f115ea26e813..5600a8f0f413b 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetDataStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetDataStream.java @@ -23,6 +23,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.PrintWriter; +import java.util.Collection; import java.util.Deque; import java.util.List; import java.util.Map; @@ -197,11 +198,11 @@ public GlobalData requestGlobalData(GlobalDataRequest request) { } @Override - public void refreshActiveWork(Map> heartbeats) { + public void refreshActiveWork(Map> heartbeats) { StreamingGetDataRequest.Builder builder = StreamingGetDataRequest.newBuilder(); if (sendKeyedGetDataRequests) { long builderBytes = 0; - for (Map.Entry> entry : heartbeats.entrySet()) { + for (Map.Entry> entry : heartbeats.entrySet()) { for (HeartbeatRequest request : entry.getValue()) { // Calculate the bytes with some overhead for proto encoding. long bytes = (long) entry.getKey().length() + request.getSerializedSize() + 10; @@ -232,7 +233,7 @@ public void refreshActiveWork(Map> heartbeats) { } else { // No translation necessary, but we must still respect `RPC_STREAM_CHUNK_SIZE`. 
long builderBytes = 0; - for (Map.Entry> entry : heartbeats.entrySet()) { + for (Map.Entry> entry : heartbeats.entrySet()) { ComputationHeartbeatRequest.Builder computationHeartbeatBuilder = ComputationHeartbeatRequest.newBuilder().setComputationId(entry.getKey()); for (HeartbeatRequest request : entry.getValue()) { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillStreamFactory.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillStreamFactory.java index 1623dfcc7d6f2..235c6b132fdbb 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillStreamFactory.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillStreamFactory.java @@ -200,7 +200,7 @@ public GetWorkStream createDirectGetWorkStream( Supplier workCommitter, WorkItemScheduler workItemScheduler) { return GrpcDirectGetWorkStream.create( - connection.backendWorkerToken().orElse(NO_BACKEND_WORKER_TOKEN), + connection.backendWorkerToken(), responseObserver -> withDefaultDeadline(connection.stub()).getWorkStream(responseObserver), request, grpcBackOff.get(), diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClient.java index 66bebd0e939b4..62aa4e1c1a05f 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClient.java +++ 
b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClient.java @@ -263,7 +263,7 @@ private void startWorkerMetadataConsumer() { @VisibleForTesting public synchronized void finish() { Preconditions.checkState(started, "StreamingEngineClient never started."); - getWorkerMetadataStream.get().close(); + getWorkerMetadataStream.get().halfClose(); getWorkBudgetRefresher.stop(); newWorkerMetadataPublisher.shutdownNow(); newWorkerMetadataConsumer.shutdownNow(); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSender.java index 16890e0b69f4a..12f8a3c7f901c 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSender.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSender.java @@ -138,10 +138,10 @@ void closeAllStreams() { // streaming RPCs by possibly making calls over the network. Do not close the streams unless // they have already been started. 
if (started.get()) { - getWorkStream.get().close(); - getDataStream.get().close(); + getWorkStream.get().halfClose(); + getDataStream.get().halfClose(); workCommitter.get().stop(); - commitWorkStream.get().close(); + commitWorkStream.get().halfClose(); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java index 334ab8efeae22..ac5deccdc200e 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java @@ -397,8 +397,7 @@ private ExecuteWorkResult executeWork( computationState.releaseComputationWorkExecutor(computationWorkExecutor); work.setState(Work.State.COMMIT_QUEUED); - outputBuilder.addAllPerWorkItemLatencyAttributions( - work.getLatencyAttributions(false, sampler)); + outputBuilder.addAllPerWorkItemLatencyAttributions(work.getLatencyAttributions(sampler)); return ExecuteWorkResult.create( outputBuilder, stateReader.getBytesRead() + localSideInputStateFetcher.getBytesRead()); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java index 11197c74d21f4..7f8e3bee66fef 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java +++ 
b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java @@ -17,10 +17,10 @@ */ package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; -import java.util.ArrayList; +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap.toImmutableMap; + import java.util.Collection; import java.util.HashMap; -import java.util.List; import java.util.Map; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; @@ -30,9 +30,8 @@ import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; import org.apache.beam.runners.dataflow.worker.streaming.ComputationState; import org.apache.beam.runners.dataflow.worker.streaming.RefreshableWork; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.sdk.annotations.Internal; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Table; import org.joda.time.Duration; import org.joda.time.Instant; import org.slf4j.Logger; @@ -56,7 +55,7 @@ public final class ActiveWorkRefresher { private final DataflowExecutionStateSampler sampler; private final int stuckCommitDurationMillis; private final ScheduledExecutorService activeWorkRefreshExecutor; - private final Consumer> heartbeatSender; + private final Consumer> heartbeatSender; public ActiveWorkRefresher( Supplier clock, @@ -65,7 +64,7 @@ public ActiveWorkRefresher( Supplier> computations, DataflowExecutionStateSampler sampler, ScheduledExecutorService activeWorkRefreshExecutor, - Consumer> heartbeatSender) { + Consumer> heartbeatSender) { this.clock = clock; this.activeWorkRefreshPeriodMillis = activeWorkRefreshPeriodMillis; this.stuckCommitDurationMillis = stuckCommitDurationMillis; @@ -75,6 +74,16 @@ public ActiveWorkRefresher( this.heartbeatSender = heartbeatSender; } 
+ private static Windmill.HeartbeatRequest createHeartbeatRequest( + RefreshableWork work, DataflowExecutionStateSampler sampler) { + return Windmill.HeartbeatRequest.newBuilder() + .setShardingKey(work.getShardedKey().shardingKey()) + .setWorkToken(work.id().workToken()) + .setCacheToken(work.id().cacheToken()) + .addAllLatencyAttribution(work.getHeartbeatLatencyAttributions(sampler)) + .build(); + } + @SuppressWarnings("FutureReturnValueIgnored") public void start() { if (activeWorkRefreshPeriodMillis > 0) { @@ -119,32 +128,21 @@ private void invalidateStuckCommits() { private void refreshActiveWork() { Instant refreshDeadline = clock.get().minus(Duration.millis(activeWorkRefreshPeriodMillis)); - Map fannedOutHeartbeatRequests = new HashMap<>(); + Map heartbeatsBySender = new HashMap<>(); + // Aggregate the heartbeats across computations by HeartbeatSender for correct fan out. for (ComputationState computationState : computations.get()) { - String computationId = computationState.getComputationId(); - - // Get heartbeat requests for computation's current active work, aggregated by GetDataStream - // to correctly fan-out the heartbeat requests. - Table heartbeats = - HeartbeatRequests.getRefreshableKeyHeartbeats( - computationState.currentActiveWorkReadOnly(), refreshDeadline, sampler); - - // Aggregate the heartbeats across computations by GetDataStream for correct fan out. 
- for (Table.Cell heartbeatsPerStream : - heartbeats.cellSet()) { - Heartbeat heartbeat = - fannedOutHeartbeatRequests.computeIfAbsent( - heartbeatsPerStream.getRowKey(), ignored -> Heartbeat.create()); - heartbeat.work().add(heartbeatsPerStream.getColumnKey()); - List existingHeartbeatsForComputation = - heartbeat - .heartbeatRequests() - .computeIfAbsent(computationId, ignored -> new ArrayList<>()); - existingHeartbeatsForComputation.add(heartbeatsPerStream.getValue()); + for (RefreshableWork work : computationState.getRefreshableWork(refreshDeadline)) { + heartbeatsBySender + .computeIfAbsent(work.heartbeatSender(), ignored -> Heartbeats.builder()) + .addWork(work) + .addHeartbeatRequest( + computationState.getComputationId(), createHeartbeatRequest(work, sampler)); } } - heartbeatSender.accept(fannedOutHeartbeatRequests); + heartbeatSender.accept( + heartbeatsBySender.entrySet().stream() + .collect(toImmutableMap(Map.Entry::getKey, e -> e.getValue().build()))); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ApplianceHeartbeatSender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ApplianceHeartbeatSender.java index ab32108ae5bae..b0f7144338050 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ApplianceHeartbeatSender.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ApplianceHeartbeatSender.java @@ -17,7 +17,7 @@ */ package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; -import java.util.List; +import java.util.Collection; import java.util.Map; import java.util.function.Consumer; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; @@ -37,11 +37,11 @@ public ApplianceHeartbeatSender(Consumer sendHeartbeatF * must 
translate the HeartbeatRequest to a KeyedGetDataRequest here. */ @Override - public void sendHeartbeats(Heartbeat heartbeats) { + public void sendHeartbeats(Heartbeats heartbeats) { Windmill.GetDataRequest.Builder builder = Windmill.GetDataRequest.newBuilder(); - for (Map.Entry> entry : - heartbeats.heartbeatRequests().entrySet()) { + for (Map.Entry> entry : + heartbeats.heartbeatRequests().asMap().entrySet()) { Windmill.ComputationGetDataRequest.Builder perComputationBuilder = Windmill.ComputationGetDataRequest.newBuilder(); perComputationBuilder.setComputationId(entry.getKey()); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/DirectHeartbeatSender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/FixedStreamHeartbeatSender.java similarity index 74% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/DirectHeartbeatSender.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/FixedStreamHeartbeatSender.java index 0c5b16ad1b894..7da48d4f0218d 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/DirectHeartbeatSender.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/FixedStreamHeartbeatSender.java @@ -17,6 +17,7 @@ */ package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; +import java.util.Objects; import org.apache.beam.runners.dataflow.worker.streaming.RefreshableWork; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; import org.apache.beam.sdk.annotations.Internal; @@ -31,22 +32,23 @@ *

{@link #equals(Object)} and {@link #hashCode()} implementations delegate to internal * {@link GetDataStream} implementations so that requests can be grouped and sent on the same * stream instance. + *

This class is a stateless decorator to the underlying stream. */ @Internal -public final class DirectHeartbeatSender implements HeartbeatSender { - private static final Logger LOG = LoggerFactory.getLogger(DirectHeartbeatSender.class); +public final class FixedStreamHeartbeatSender implements HeartbeatSender { + private static final Logger LOG = LoggerFactory.getLogger(FixedStreamHeartbeatSender.class); private final GetDataStream getDataStream; - private DirectHeartbeatSender(GetDataStream getDataStream) { + private FixedStreamHeartbeatSender(GetDataStream getDataStream) { this.getDataStream = getDataStream; } - public static DirectHeartbeatSender create(GetDataStream getDataStream) { - return new DirectHeartbeatSender(getDataStream); + public static FixedStreamHeartbeatSender create(GetDataStream getDataStream) { + return new FixedStreamHeartbeatSender(getDataStream); } @Override - public void sendHeartbeats(Heartbeat heartbeats) { + public void sendHeartbeats(Heartbeats heartbeats) { if (getDataStream.isShutdown()) { LOG.warn( "Trying to refresh work w/ {} heartbeats on stream={} after work has moved off of worker." 
@@ -55,18 +57,18 @@ public void sendHeartbeats(Heartbeat heartbeats) { heartbeats.heartbeatRequests().size()); heartbeats.work().forEach(RefreshableWork::setFailed); } else { - getDataStream.refreshActiveWork(heartbeats.heartbeatRequests()); + getDataStream.refreshActiveWork(heartbeats.heartbeatRequests().asMap()); } } @Override public int hashCode() { - return getDataStream.hashCode(); + return Objects.hash(FixedStreamHeartbeatSender.class, getDataStream); } @Override public boolean equals(Object obj) { - return obj instanceof DirectHeartbeatSender - && getDataStream.equals(((DirectHeartbeatSender) obj).getDataStream); + return obj instanceof FixedStreamHeartbeatSender + && getDataStream.equals(((FixedStreamHeartbeatSender) obj).getDataStream); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeat.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeat.java deleted file mode 100644 index 8cc7d50dd48e6..0000000000000 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeat.java +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; - -import com.google.auto.value.AutoValue; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import org.apache.beam.runners.dataflow.worker.streaming.RefreshableWork; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill; - -/** Heartbeat requests and the work that was used to generate the heartbeat requests. */ -@AutoValue -public abstract class Heartbeat { - - static Heartbeat create() { - return new AutoValue_Heartbeat(new HashSet<>(), new HashMap<>()); - } - - abstract Collection work(); - - public abstract Map> heartbeatRequests(); -} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatRequests.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatRequests.java deleted file mode 100644 index 792548f93b420..0000000000000 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatRequests.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; - -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableTable.toImmutableTable; - -import com.google.auto.value.AutoValue; -import java.util.Collection; -import java.util.Map; -import java.util.stream.Stream; -import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; -import org.apache.beam.runners.dataflow.worker.streaming.RefreshableWork; -import org.apache.beam.runners.dataflow.worker.streaming.ShardedKey; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; -import org.apache.beam.sdk.annotations.Internal; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableListMultimap; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Table; -import org.joda.time.Instant; - -/** Helper factory class for creating heartbeat requests. 
*/ -@Internal -public final class HeartbeatRequests { - - private HeartbeatRequests() {} - - static Table getRefreshableKeyHeartbeats( - ImmutableListMultimap activeWork, - Instant refreshDeadline, - DataflowExecutionStateSampler sampler) { - return activeWork.asMap().entrySet().stream() - .flatMap(e -> toHeartbeatRow(e, refreshDeadline, sampler)) - .collect(toImmutableTable(HeartbeatRow::sender, HeartbeatRow::work, HeartbeatRow::request)); - } - - private static Stream toHeartbeatRow( - Map.Entry> shardedKeyAndWorkQueue, - Instant refreshDeadline, - DataflowExecutionStateSampler sampler) { - ShardedKey shardedKey = shardedKeyAndWorkQueue.getKey(); - Collection workQueue = shardedKeyAndWorkQueue.getValue(); - return workQueue.stream() - .filter(work -> work.isRefreshable(refreshDeadline)) - .map(work -> HeartbeatRow.create(work, createHeartbeatRequest(shardedKey, work, sampler))); - } - - private static HeartbeatRequest createHeartbeatRequest( - ShardedKey shardedKey, RefreshableWork work, DataflowExecutionStateSampler sampler) { - return HeartbeatRequest.newBuilder() - .setShardingKey(shardedKey.shardingKey()) - .setWorkToken(work.id().workToken()) - .setCacheToken(work.id().cacheToken()) - .addAllLatencyAttribution(work.getLatencyAttributions(/* isHeartbeat= */ true, sampler)) - .build(); - } - - @AutoValue - abstract static class HeartbeatRow { - - private static HeartbeatRow create(RefreshableWork work, HeartbeatRequest request) { - return new AutoValue_HeartbeatRequests_HeartbeatRow(work.heartbeatSender(), work, request); - } - - abstract HeartbeatSender sender(); - - abstract RefreshableWork work(); - - abstract HeartbeatRequest request(); - } -} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatSender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatSender.java index 
c35cd6891aadb..3ee0090ebcaa8 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatSender.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatSender.java @@ -24,5 +24,5 @@ public interface HeartbeatSender { * Send heartbeats. Heartbeats represent WorkItem that is actively being processed belonging to * the computation. */ - void sendHeartbeats(Heartbeat heartbeats); + void sendHeartbeats(Heartbeats heartbeats); } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeats.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeats.java new file mode 100644 index 0000000000000..cff65ca183257 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeats.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; + +import com.google.auto.value.AutoValue; +import org.apache.beam.runners.dataflow.worker.streaming.RefreshableWork; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableListMultimap; + +/** Heartbeat requests and the work that was used to generate the heartbeat requests. */ +@AutoValue +public abstract class Heartbeats { + + static Heartbeats.Builder builder() { + return new AutoValue_Heartbeats.Builder(); + } + + abstract ImmutableList work(); + + public abstract ImmutableListMultimap heartbeatRequests(); + + public final int size() { + return heartbeatRequests().asMap().size(); + } + + @AutoValue.Builder + public abstract static class Builder { + abstract Builder setWork(ImmutableList value); + + abstract ImmutableList.Builder workBuilder(); + + public final Builder addWork(RefreshableWork work) { + workBuilder().add(work); + return this; + } + + abstract Builder setHeartbeatRequests( + ImmutableListMultimap value); + + abstract ImmutableListMultimap.Builder + heartbeatRequestsBuilder(); + + public final Builder addHeartbeatRequest( + String computationId, Windmill.HeartbeatRequest heartbeatRequest) { + heartbeatRequestsBuilder().put(computationId, heartbeatRequest); + return this; + } + + public abstract Heartbeats build(); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/StreamPoolHeartbeatSender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/StreamPoolHeartbeatSender.java index 4a1a6bc49301a..e571f89f142cc 100644 --- 
a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/StreamPoolHeartbeatSender.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/StreamPoolHeartbeatSender.java @@ -37,10 +37,10 @@ public StreamPoolHeartbeatSender( } @Override - public void sendHeartbeats(Heartbeat heartbeats) { + public void sendHeartbeats(Heartbeats heartbeats) { try (CloseableStream closeableStream = heartbeatStreamPool.getCloseableStream()) { - closeableStream.stream().refreshActiveWork(heartbeats.heartbeatRequests()); + closeableStream.stream().refreshActiveWork(heartbeats.heartbeatRequests().asMap()); } catch (Exception e) { LOG.warn("Error occurred sending heartbeats=[{}].", heartbeats, e); } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java index b76a16aab573d..5406a72927393 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java @@ -28,6 +28,7 @@ import static org.junit.Assert.assertFalse; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -244,7 +245,7 @@ public boolean isShutdown() { } @Override - public void close() { + public void halfClose() { done.countDown(); } @@ -269,7 +270,7 @@ public boolean awaitTermination(int time, TimeUnit unit) throws InterruptedExcep try { sleepMillis(500); } catch (InterruptedException e) { - close(); + halfClose(); Thread.currentThread().interrupt(); } continue; @@ -355,9 +356,9 @@ public Windmill.GlobalData 
requestGlobalData(Windmill.GlobalDataRequest request) } @Override - public void refreshActiveWork(Map> heartbeats) { + public void refreshActiveWork(Map> heartbeats) { Windmill.GetDataRequest.Builder builder = Windmill.GetDataRequest.newBuilder(); - for (Map.Entry> entry : heartbeats.entrySet()) { + for (Map.Entry> entry : heartbeats.entrySet()) { builder.addComputationHeartbeatRequest( ComputationHeartbeatRequest.newBuilder() .setComputationId(entry.getKey()) @@ -373,7 +374,7 @@ public void onHeartbeatResponse(List responses) { } @Override - public void close() {} + public void halfClose() {} @Override public boolean awaitTermination(int time, TimeUnit unit) { @@ -469,7 +470,7 @@ class RequestAndDone { } @Override - public void close() {} + public void halfClose() {} @Override public boolean awaitTermination(int time, TimeUnit unit) { diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java index 200a30537dae8..0889a66914495 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java @@ -3426,7 +3426,7 @@ public void testLatencyAttributionProtobufsPopulated() { clock.sleep(Duration.millis(60)); Iterator it = - work.getLatencyAttributions(false, DataflowExecutionStateSampler.instance()).iterator(); + work.getLatencyAttributions(DataflowExecutionStateSampler.instance()).iterator(); assertTrue(it.hasNext()); LatencyAttribution lat = it.next(); assertSame(State.QUEUED, lat.getState()); diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java 
b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java index c5e24739c5d69..6db9634d4ab22 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java @@ -18,7 +18,6 @@ package org.apache.beam.runners.dataflow.worker.streaming; import static com.google.common.truth.Truth.assertThat; -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList.toImmutableList; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertSame; @@ -26,21 +25,17 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.verify; -import com.google.auto.value.AutoValue; import java.util.Collections; import java.util.Deque; import java.util.HashMap; import java.util.Map; import java.util.Optional; -import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; import org.apache.beam.runners.dataflow.worker.streaming.ActiveWorkState.ActivateWorkResult; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.joda.time.Instant; import org.junit.Before; import org.junit.Rule; @@ -449,70 +444,4 @@ public void 
testActivateWorkForKey_matchingCacheTokens_newWorkTokenLesser_STALE( assertFalse(readOnlyActiveWork.get(shardedKey).contains(newWork)); assertEquals(queuedWork, readOnlyActiveWork.get(shardedKey).peek()); } - - @Test - public void testGetKeyHeartbeats() { - Instant refreshDeadline = Instant.now(); - ShardedKey shardedKey1 = shardedKey("someKey", 1L); - ShardedKey shardedKey2 = shardedKey("anotherKey", 2L); - - ExecutableWork freshWork = createWork(createWorkItem(3L, 3L, shardedKey1)); - ExecutableWork refreshableWork1 = expiredWork(createWorkItem(1L, 1L, shardedKey1)); - refreshableWork1.work().setState(Work.State.COMMITTING); - ExecutableWork refreshableWork2 = expiredWork(createWorkItem(2L, 2L, shardedKey2)); - refreshableWork2.work().setState(Work.State.COMMITTING); - - activeWorkState.activateWorkForKey(refreshableWork1); - activeWorkState.activateWorkForKey(freshWork); - activeWorkState.activateWorkForKey(refreshableWork2); - - ImmutableList requests = - activeWorkState.getKeyHeartbeats(refreshDeadline, DataflowExecutionStateSampler.instance()); - - ImmutableList expected = - ImmutableList.of( - HeartbeatRequestShardingKeyWorkTokenAndCacheToken.from( - shardedKey1, refreshableWork1.work()), - HeartbeatRequestShardingKeyWorkTokenAndCacheToken.from( - shardedKey2, refreshableWork2.work())); - - ImmutableList actual = - requests.stream() - .map(HeartbeatRequestShardingKeyWorkTokenAndCacheToken::from) - .collect(toImmutableList()); - - assertThat(actual).containsExactlyElementsIn(expected); - } - - @AutoValue - abstract static class HeartbeatRequestShardingKeyWorkTokenAndCacheToken { - - private static HeartbeatRequestShardingKeyWorkTokenAndCacheToken create( - long shardingKey, long workToken, long cacheToken) { - return new AutoValue_ActiveWorkStateTest_HeartbeatRequestShardingKeyWorkTokenAndCacheToken( - shardingKey, workToken, cacheToken); - } - - private static HeartbeatRequestShardingKeyWorkTokenAndCacheToken from( - HeartbeatRequest heartbeatRequest) { - 
return create( - heartbeatRequest.getShardingKey(), - heartbeatRequest.getWorkToken(), - heartbeatRequest.getCacheToken()); - } - - private static HeartbeatRequestShardingKeyWorkTokenAndCacheToken from( - ShardedKey shardedKey, Work work) { - return create( - shardedKey.shardingKey(), - work.getWorkItem().getWorkToken(), - work.getWorkItem().getCacheToken()); - } - - abstract long shardingKey(); - - abstract long workToken(); - - abstract long cacheToken(); - } } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPoolTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPoolTest.java index fec01fc37c3a3..7e5b350b48323 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPoolTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPoolTest.java @@ -237,7 +237,7 @@ private TestWindmillStream(Instant startTime) { } @Override - public void close() { + public void halfClose() { closed = true; } @@ -258,7 +258,7 @@ public String backendWorkerToken() { @Override public void shutdown() { - close(); + halfClose(); } @Override diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitterTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitterTest.java index 3de55684caa85..6c44d3d954e24 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitterTest.java +++ 
b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitterTest.java @@ -275,7 +275,7 @@ public void flush() {} } @Override - public void close() {} + public void halfClose() {} @Override public boolean awaitTermination(int time, TimeUnit unit) { diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStreamTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStreamTest.java index 515beba0c88d4..4439c409b32f8 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStreamTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStreamTest.java @@ -261,7 +261,7 @@ public void testGetWorkerMetadata_correctlyAddsAndRemovesStreamFromRegistry() { .build()); assertTrue(streamRegistry.contains(stream)); - stream.close(); + stream.halfClose(); assertFalse(streamRegistry.contains(stream)); } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServerTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServerTest.java index 6473d5527a815..5cfc19ac07dfd 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServerTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServerTest.java @@ -110,14 +110,13 @@ "rawtypes", // 
TODO(https://github.com/apache/beam/issues/20447) }) public class GrpcWindmillServerTest { - @Rule public transient Timeout globalTimeout = Timeout.seconds(600); - @Rule public GrpcCleanupRule grpcCleanup = new GrpcCleanupRule(); - @Rule public ErrorCollector errorCollector = new ErrorCollector(); - private static final Logger LOG = LoggerFactory.getLogger(GrpcWindmillServerTest.class); private static final int STREAM_CHUNK_SIZE = 2 << 20; private final long clientId = 10L; private final MutableHandlerRegistry serviceRegistry = new MutableHandlerRegistry(); + @Rule public transient Timeout globalTimeout = Timeout.seconds(600); + @Rule public GrpcCleanupRule grpcCleanup = new GrpcCleanupRule(); + @Rule public ErrorCollector errorCollector = new ErrorCollector(); private Server server; private GrpcWindmillServer client; private int remainingErrors = 20; @@ -329,7 +328,7 @@ public void onCompleted() { }); assertTrue(latch.await(30, TimeUnit.SECONDS)); - stream.close(); + stream.halfClose(); assertTrue(stream.awaitTermination(30, TimeUnit.SECONDS)); } @@ -490,7 +489,7 @@ private void flushResponse() { }); } done.await(); - stream.close(); + stream.halfClose(); assertTrue(stream.awaitTermination(60, TimeUnit.SECONDS)); executor.shutdown(); } @@ -688,7 +687,7 @@ public StreamObserver commitWorkStream( // Make the commit requests, waiting for each of them to be verified and acknowledged. 
CommitWorkStream stream = client.commitWorkStream(); commitWorkTestHelper(stream, commitRequests, 0, 500); - stream.close(); + stream.halfClose(); assertTrue(stream.awaitTermination(30, TimeUnit.SECONDS)); } @@ -723,7 +722,7 @@ public StreamObserver commitWorkStream( for (Future f : futures) { f.get(); } - stream.close(); + stream.halfClose(); assertTrue(stream.awaitTermination(30, TimeUnit.SECONDS)); executor.shutdown(); } @@ -825,7 +824,7 @@ public void onCompleted() { } } - stream.close(); + stream.halfClose(); isClientClosed.set(true); deadline = System.currentTimeMillis() + 60_000; // 1 min @@ -957,13 +956,13 @@ public void onCompleted() { Map> expectedKeyedGetDataRequests = new HashMap<>(); expectedKeyedGetDataRequests.put("Computation1", makeGetDataHeartbeatRequest(computation1Keys)); expectedKeyedGetDataRequests.put("Computation2", makeGetDataHeartbeatRequest(computation2Keys)); - Map> heartbeatsToRefresh = new HashMap<>(); + Map> heartbeatsToRefresh = new HashMap<>(); heartbeatsToRefresh.put("Computation1", makeHeartbeatRequest(computation1Keys)); heartbeatsToRefresh.put("Computation2", makeHeartbeatRequest(computation2Keys)); GetDataStream stream = client.getDataStream(); stream.refreshActiveWork(heartbeatsToRefresh); - stream.close(); + stream.halfClose(); assertTrue(stream.awaitTermination(60, TimeUnit.SECONDS)); boolean receivedAllGetDataHeartbeats = false; @@ -1058,13 +1057,13 @@ public void onCompleted() { } expectedHeartbeats.add(comp1Builder.build()); expectedHeartbeats.add(comp2Builder.build()); - Map> heartbeatRequestMap = new HashMap<>(); + Map> heartbeatRequestMap = new HashMap<>(); heartbeatRequestMap.put("Computation1", makeHeartbeatRequest(computation1Keys)); heartbeatRequestMap.put("Computation2", makeHeartbeatRequest(computation2Keys)); GetDataStream stream = client.getDataStream(); stream.refreshActiveWork(heartbeatRequestMap); - stream.close(); + stream.halfClose(); assertTrue(stream.awaitTermination(60, TimeUnit.SECONDS)); boolean 
receivedAllHeartbeatRequests = false; @@ -1185,7 +1184,7 @@ public void onCompleted() { // actually report more due to backoff in restarting streams. assertTrue(this.client.getAndResetThrottleTime() > throttleTime); - stream.close(); + stream.halfClose(); assertTrue(stream.awaitTermination(30, TimeUnit.SECONDS)); } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSenderTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSenderTest.java index a515ff4161a47..9017b673c0088 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSenderTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSenderTest.java @@ -226,9 +226,9 @@ public void testCloseAllStreams_closesAllStreams() { windmillStreamSender.startStreams(); windmillStreamSender.closeAllStreams(); - verify(mockGetWorkStream).close(); - verify(mockGetDataStream).close(); - verify(mockCommitWorkStream).close(); + verify(mockGetWorkStream).halfClose(); + verify(mockGetDataStream).halfClose(); + verify(mockCommitWorkStream).halfClose(); } private WindmillStreamSender newWindmillStreamSender(GetWorkBudget budget) { diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresherTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresherTest.java index 845687b457b08..d6052270966cf 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresherTest.java +++ 
b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresherTest.java @@ -98,7 +98,7 @@ private ActiveWorkRefresher createActiveWorkRefresher( int activeWorkRefreshPeriodMillis, int stuckCommitDurationMillis, Supplier> computations, - Consumer> activeWorkRefresherFn) { + Consumer> activeWorkRefresherFn) { return new ActiveWorkRefresher( clock, activeWorkRefreshPeriodMillis, @@ -164,7 +164,7 @@ public void testActiveWorkRefresh() throws InterruptedException { activeWorkForComputation.add(fakeWork); } - Map fanoutExpectedHeartbeats = new HashMap<>(); + Map fanoutExpectedHeartbeats = new HashMap<>(); CountDownLatch heartbeatsSent = new CountDownLatch(1); TestClock fakeClock = new TestClock(Instant.now()); ActiveWorkRefresher activeWorkRefresher = @@ -186,12 +186,12 @@ public void testActiveWorkRefresh() throws InterruptedException { assertThat(computationsAndWork.size()) .isEqualTo( Iterables.getOnlyElement(fanoutExpectedHeartbeats.values()).heartbeatRequests().size()); - for (Map.Entry fanOutExpectedHeartbeat : + for (Map.Entry fanOutExpectedHeartbeat : fanoutExpectedHeartbeats.entrySet()) { - for (Map.Entry> expectedHeartbeat : - fanOutExpectedHeartbeat.getValue().heartbeatRequests().entrySet()) { + for (Map.Entry> expectedHeartbeat : + fanOutExpectedHeartbeat.getValue().heartbeatRequests().asMap().entrySet()) { String computationId = expectedHeartbeat.getKey(); - List heartbeatRequests = expectedHeartbeat.getValue(); + Collection heartbeatRequests = expectedHeartbeat.getValue(); List work = computationsAndWork.get(computationId).stream() .map(ExecutableWork::work) diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatRequestsTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatRequestsTest.java deleted file mode 100644 
index 1f0a12383ea40..0000000000000 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatRequestsTest.java +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; - -import static com.google.common.truth.Truth.assertThat; -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList.toImmutableList; - -import com.google.auto.value.AutoValue; -import java.util.ArrayDeque; -import java.util.Collections; -import java.util.Deque; -import java.util.HashMap; -import java.util.Map; -import java.util.Objects; -import java.util.stream.Collectors; -import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; -import org.apache.beam.runners.dataflow.worker.streaming.RefreshableWork; -import org.apache.beam.runners.dataflow.worker.streaming.ShardedKey; -import org.apache.beam.runners.dataflow.worker.streaming.Watermarks; -import org.apache.beam.runners.dataflow.worker.streaming.Work; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableListMultimap; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Table; -import org.joda.time.Instant; -import org.junit.Before; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -@RunWith(JUnit4.class) -public class HeartbeatRequestsTest { - - private Map> activeWork; - - private static Work createWork(Windmill.WorkItem workItem, HeartbeatSender heartbeatSender) { - return Work.create( - workItem, - Watermarks.builder().setInputDataWatermark(Instant.EPOCH).build(), - createProcessingContext(heartbeatSender), - Instant::now, - Collections.emptyList()); - } - - private static ShardedKey shardedKey(String str, long shardKey) { - return ShardedKey.create(ByteString.copyFromUtf8(str), shardKey); - } - - private static Work.ProcessingContext 
createProcessingContext(HeartbeatSender heartbeatSender) { - return Work.createProcessingContext( - "computationId", - (computationId, request) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}, - heartbeatSender); - } - - private static Work expiredWork(Windmill.WorkItem workItem, HeartbeatSender heartbeatSender) { - return Work.create( - workItem, - Watermarks.builder().setInputDataWatermark(Instant.EPOCH).build(), - createProcessingContext(heartbeatSender), - () -> Instant.EPOCH, - Collections.emptyList()); - } - - private static Windmill.WorkItem createWorkItem(long workToken, long cacheToken) { - return Windmill.WorkItem.newBuilder() - .setKey(ByteString.copyFromUtf8("")) - .setShardingKey(1) - .setWorkToken(workToken) - .setCacheToken(cacheToken) - .build(); - } - - @Before - public void setUp() { - activeWork = new HashMap<>(); - } - - @Test - public void testGetRefreshableFanoutKeyHeartbeats() { - Instant refreshDeadline = Instant.now(); - HeartbeatSender sender1 = ignored -> {}; - HeartbeatSender sender2 = ignored -> {}; - - Work freshWork = createWork(createWorkItem(3L, 3L), sender1); - Work refreshableWork1 = expiredWork(createWorkItem(1L, 1L), sender1); - refreshableWork1.setState(Work.State.COMMITTING); - Work refreshableWork2 = expiredWork(createWorkItem(2L, 2L), sender2); - refreshableWork2.setState(Work.State.COMMITTING); - ShardedKey shardedKey1 = shardedKey("someKey", 1L); - ShardedKey shardedKey2 = shardedKey("anotherKey", 2L); - - activateWorkForKey(shardedKey1, refreshableWork1); - activateWorkForKey(shardedKey1, freshWork); - activateWorkForKey(shardedKey2, refreshableWork2); - - Table requests = - HeartbeatRequests.getRefreshableKeyHeartbeats( - currentActiveWork(), refreshDeadline, DataflowExecutionStateSampler.instance()); - - ImmutableList expected = - ImmutableList.of( - HeartbeatRequestShardingKeyWorkTokenAndCacheToken.from(shardedKey1, refreshableWork1), - 
HeartbeatRequestShardingKeyWorkTokenAndCacheToken.from(shardedKey2, refreshableWork2)); - - ImmutableList actual = - requests.cellSet().stream() - .map( - entry -> - HeartbeatRequestShardingKeyWorkTokenAndCacheToken.from( - entry.getValue(), entry.getRowKey())) - .collect(toImmutableList()); - - assertThat(actual).containsExactlyElementsIn(expected); - } - - private void activateWorkForKey(ShardedKey shardedKey, Work work) { - Deque workQueue = activeWork.computeIfAbsent(shardedKey, ignored -> new ArrayDeque<>()); - workQueue.addLast(work); - } - - private ImmutableListMultimap currentActiveWork() { - ImmutableListMultimap.Builder currentActiveWork = - ImmutableListMultimap.builder(); - - for (Map.Entry> keyedWorkQueues : activeWork.entrySet()) { - currentActiveWork.putAll( - keyedWorkQueues.getKey(), - keyedWorkQueues.getValue().stream() - .map(Work::refreshableView) - .collect(Collectors.toList())); - } - - return currentActiveWork.build(); - } - - @AutoValue - abstract static class HeartbeatRequestShardingKeyWorkTokenAndCacheToken { - - private static HeartbeatRequestShardingKeyWorkTokenAndCacheToken create( - long shardingKey, long workToken, long cacheToken, HeartbeatSender sender) { - return new AutoValue_HeartbeatRequestsTest_HeartbeatRequestShardingKeyWorkTokenAndCacheToken( - shardingKey, workToken, cacheToken, sender); - } - - private static HeartbeatRequestShardingKeyWorkTokenAndCacheToken from( - ShardedKey shardedKey, Work work) { - return create( - shardedKey.shardingKey(), - work.getWorkItem().getWorkToken(), - work.getWorkItem().getCacheToken(), - work.heartbeatSender()); - } - - private static HeartbeatRequestShardingKeyWorkTokenAndCacheToken from( - Windmill.HeartbeatRequest heartbeatRequest, HeartbeatSender sender) { - return create( - heartbeatRequest.getShardingKey(), - heartbeatRequest.getWorkToken(), - heartbeatRequest.getCacheToken(), - sender); - } - - abstract long shardingKey(); - - abstract long workToken(); - - abstract long 
cacheToken(); - - abstract HeartbeatSender heartbeatSender(); - - @Override - public final boolean equals(Object obj) { - if (!(obj instanceof HeartbeatRequestShardingKeyWorkTokenAndCacheToken)) { - return false; - } - HeartbeatRequestShardingKeyWorkTokenAndCacheToken other = - (HeartbeatRequestShardingKeyWorkTokenAndCacheToken) obj; - return shardingKey() == other.shardingKey() - && workToken() == other.workToken() - && cacheToken() == other.cacheToken(); - } - - @Override - public final int hashCode() { - return Objects.hash(shardingKey(), workToken(), cacheToken()); - } - } -} From 469387012e585f25d355f88022423568d62d4a70 Mon Sep 17 00:00:00 2001 From: Martin Trieu Date: Fri, 12 Jul 2024 22:53:52 +0900 Subject: [PATCH 4/7] move side input state API into Work instance w/ keyed state fetching. Future proofs this logic for direct path. --- .../worker/StreamingDataflowWorker.java | 7 +- .../dataflow/worker/streaming/Work.java | 29 +++-- .../sideinput/SideInputStateFetcher.java | 112 ++++++++---------- .../SideInputStateFetcherFactory.java | 46 +++++++ .../client/getdata/DirectGetDataClient.java | 16 +-- .../client/grpc/GrpcDirectGetWorkStream.java | 23 ++-- .../grpc/GrpcWindmillStreamFactory.java | 8 +- .../client/grpc/StreamingEngineClient.java | 24 +++- .../client/grpc/WindmillStreamSender.java | 14 ++- .../processing/StreamingWorkScheduler.java | 12 +- .../worker/StreamingDataflowWorkerTest.java | 19 ++- .../StreamingModeExecutionContextTest.java | 21 +++- .../worker/WorkerCustomSourcesTest.java | 23 +++- .../worker/streaming/ActiveWorkStateTest.java | 21 +++- .../streaming/ComputationStateCacheTest.java | 18 ++- .../sideinput/SideInputStateFetcherTest.java | 89 +++++++------- .../worker/util/BoundedQueueExecutorTest.java | 18 ++- .../StreamingApplianceWorkCommitterTest.java | 18 ++- .../StreamingEngineWorkCommitterTest.java | 18 ++- .../grpc/StreamingEngineClientTest.java | 21 ++-- .../client/grpc/WindmillStreamSenderTest.java | 12 +- 
.../EvenGetWorkBudgetDistributorTest.java | 2 + .../failures/WorkFailureProcessorTest.java | 18 ++- .../work/refresh/ActiveWorkRefresherTest.java | 21 +++- 24 files changed, 415 insertions(+), 195 deletions(-) create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcherFactory.java diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java index b2b4b0796db59..58e7a370366fc 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java @@ -865,10 +865,7 @@ private void dispatchLoop() { workItem, watermarks.setOutputDataWatermark(workItem.getOutputDataWatermark()).build(), Work.createProcessingContext( - computationId, - getDataClient::getStateData, - workCommitter::commit, - heartbeatSender), + computationId, getDataClient, workCommitter::commit, heartbeatSender), /* getWorkStreamLatencies= */ Collections.emptyList()); } } @@ -904,7 +901,7 @@ void streamingDispatchLoop() { .build(), Work.createProcessingContext( computationState.getComputationId(), - getDataClient::getStateData, + getDataClient, workCommitter::commit, heartbeatSender), getWorkStreamLatencies); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java index 6a58ef93069fe..71ffd98ac1c03 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java +++ 
b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java @@ -27,14 +27,14 @@ import java.util.Map; import java.util.Map.Entry; import java.util.Optional; -import java.util.function.BiFunction; import java.util.function.Consumer; -import java.util.function.Function; import java.util.function.Supplier; import javax.annotation.concurrent.NotThreadSafe; import org.apache.beam.repackaged.core.org.apache.commons.lang3.tuple.Pair; import org.apache.beam.runners.dataflow.worker.ActiveMessageMetadata; import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalData; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalDataRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.KeyedGetDataRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.KeyedGetDataResponse; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.LatencyAttribution; @@ -45,6 +45,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItemCommitRequest; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.Commit; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateReader; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.sdk.annotations.Internal; @@ -106,10 +107,10 @@ public static Work create( public static ProcessingContext createProcessingContext( String computationId, - BiFunction getKeyedDataFn, + GetDataClient getDataClient, Consumer workCommitter, HeartbeatSender heartbeatSender) { - return ProcessingContext.create(computationId, getKeyedDataFn, workCommitter, heartbeatSender); + return 
ProcessingContext.create(computationId, getDataClient, workCommitter, heartbeatSender); } private static LatencyAttribution.Builder createLatencyAttributionWithActiveLatencyBreakdown( @@ -163,7 +164,11 @@ public ShardedKey getShardedKey() { } public Optional fetchKeyedState(KeyedGetDataRequest keyedGetDataRequest) { - return processingContext.keyedDataFetcher().apply(keyedGetDataRequest); + return processingContext.fetchKeyedState(keyedGetDataRequest); + } + + public GlobalData fetchSideInput(GlobalDataRequest request) { + return processingContext.getDataClient().getSideInputData(request); } public Watermarks watermarks() { @@ -344,22 +349,18 @@ public abstract static class ProcessingContext { private static ProcessingContext create( String computationId, - BiFunction getKeyedDataFn, + GetDataClient getDataClient, Consumer workCommitter, HeartbeatSender heartbeatSender) { return new AutoValue_Work_ProcessingContext( - computationId, - request -> Optional.ofNullable(getKeyedDataFn.apply(computationId, request)), - heartbeatSender, - workCommitter); + computationId, getDataClient, heartbeatSender, workCommitter); } /** Computation that the {@link Work} belongs to. */ public abstract String computationId(); /** Handles GetData requests to streaming backend. */ - public abstract Function> - keyedDataFetcher(); + public abstract GetDataClient getDataClient(); public abstract HeartbeatSender heartbeatSender(); @@ -368,5 +369,9 @@ private static ProcessingContext create( * {@link WorkItem}. 
*/ public abstract Consumer workCommitter(); + + private Optional fetchKeyedState(KeyedGetDataRequest request) { + return Optional.ofNullable(getDataClient().getStateData(computationId(), request)); + } } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcher.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcher.java index 7fd2487575c2d..113b760556dfd 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcher.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcher.java @@ -30,7 +30,6 @@ import java.util.function.Function; import javax.annotation.concurrent.NotThreadSafe; import org.apache.beam.runners.core.InMemoryMultimapSideInputView; -import org.apache.beam.runners.dataflow.options.DataflowStreamingPipelineOptions; import org.apache.beam.runners.dataflow.worker.WindmillTimeUtils; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalData; @@ -46,13 +45,12 @@ import org.apache.beam.sdk.values.PCollectionView; import org.apache.beam.sdk.values.TupleTag; import org.apache.beam.sdk.values.WindowingStrategy; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Supplier; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** Class responsible for fetching state from the windmill server. */ +/** Class responsible for fetching side input state from the streaming backend. 
*/ @NotThreadSafe public class SideInputStateFetcher { private static final Logger LOG = LoggerFactory.getLogger(SideInputStateFetcher.class); @@ -64,13 +62,6 @@ public class SideInputStateFetcher { private final Function fetchGlobalDataFn; private long bytesRead = 0L; - public SideInputStateFetcher( - Function fetchGlobalDataFn, - DataflowStreamingPipelineOptions options) { - this(fetchGlobalDataFn, SideInputCache.create(options)); - } - - @VisibleForTesting SideInputStateFetcher( Function fetchGlobalDataFn, SideInputCache sideInputCache) { this.fetchGlobalDataFn = fetchGlobalDataFn; @@ -103,12 +94,56 @@ private static Coder getCoder(PCollectionView view) { return view.getCoderInternal(); } - /** Returns a view of the underlying cache that keeps track of bytes read separately. */ - public SideInputStateFetcher byteTrackingView() { - return new SideInputStateFetcher(fetchGlobalDataFn, sideInputCache); + private static SideInput createSideInputCacheEntry( + PCollectionView view, GlobalData data) throws IOException { + Iterable rawData = decodeRawData(view, data); + switch (getViewFn(view).getMaterialization().getUrn()) { + case ITERABLE_MATERIALIZATION_URN: + { + @SuppressWarnings({ + "unchecked", // ITERABLE_MATERIALIZATION_URN has ViewFn. + "rawtypes" // TODO(https://github.com/apache/beam/issues/20447) + }) + ViewFn viewFn = (ViewFn) getViewFn(view); + return SideInput.ready(viewFn.apply(() -> rawData), data.getData().size()); + } + case MULTIMAP_MATERIALIZATION_URN: + { + @SuppressWarnings({ + "unchecked", // MULTIMAP_MATERIALIZATION_URN has ViewFn. 
+ "rawtypes" // TODO(https://github.com/apache/beam/issues/20447) + }) + ViewFn viewFn = (ViewFn) getViewFn(view); + Coder keyCoder = ((KvCoder) getCoder(view)).getKeyCoder(); + + @SuppressWarnings({ + "unchecked", // Safe since multimap rawData is of type Iterable> + "rawtypes", // TODO(https://github.com/apache/beam/issues/20447) + }) + T multimapSideInputValue = + viewFn.apply( + InMemoryMultimapSideInputView.fromIterable(keyCoder, (Iterable) rawData)); + return SideInput.ready(multimapSideInputValue, data.getData().size()); + } + default: + { + throw new IllegalStateException( + "Unknown side input materialization format requested: " + + getViewFn(view).getMaterialization().getUrn()); + } + } } - public long getBytesRead() { + private static void validateViewMaterialization(PCollectionView view) { + String materializationUrn = getViewFn(view).getMaterialization().getUrn(); + checkState( + SUPPORTED_MATERIALIZATIONS.contains(materializationUrn), + "Only materialization's of type %s supported, received %s", + SUPPORTED_MATERIALIZATIONS, + materializationUrn); + } + + public final long getBytesRead() { return bytesRead; } @@ -200,53 +235,4 @@ private SideInput loadSideInputFromWindmill( bytesRead += data.getSerializedSize(); return data.getIsReady() ? 
createSideInputCacheEntry(view, data) : SideInput.notReady(); } - - private void validateViewMaterialization(PCollectionView view) { - String materializationUrn = getViewFn(view).getMaterialization().getUrn(); - checkState( - SUPPORTED_MATERIALIZATIONS.contains(materializationUrn), - "Only materialization's of type %s supported, received %s", - SUPPORTED_MATERIALIZATIONS, - materializationUrn); - } - - private SideInput createSideInputCacheEntry(PCollectionView view, GlobalData data) - throws IOException { - Iterable rawData = decodeRawData(view, data); - switch (getViewFn(view).getMaterialization().getUrn()) { - case ITERABLE_MATERIALIZATION_URN: - { - @SuppressWarnings({ - "unchecked", // ITERABLE_MATERIALIZATION_URN has ViewFn. - "rawtypes" // TODO(https://github.com/apache/beam/issues/20447) - }) - ViewFn viewFn = (ViewFn) getViewFn(view); - return SideInput.ready(viewFn.apply(() -> rawData), data.getData().size()); - } - case MULTIMAP_MATERIALIZATION_URN: - { - @SuppressWarnings({ - "unchecked", // MULTIMAP_MATERIALIZATION_URN has ViewFn. 
- "rawtypes" // TODO(https://github.com/apache/beam/issues/20447) - }) - ViewFn viewFn = (ViewFn) getViewFn(view); - Coder keyCoder = ((KvCoder) getCoder(view)).getKeyCoder(); - - @SuppressWarnings({ - "unchecked", // Safe since multimap rawData is of type Iterable> - "rawtypes", // TODO(https://github.com/apache/beam/issues/20447) - }) - T multimapSideInputValue = - viewFn.apply( - InMemoryMultimapSideInputView.fromIterable(keyCoder, (Iterable) rawData)); - return SideInput.ready(multimapSideInputValue, data.getData().size()); - } - default: - { - throw new IllegalStateException( - "Unknown side input materialization format requested: " - + getViewFn(view).getMaterialization().getUrn()); - } - } - } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcherFactory.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcherFactory.java new file mode 100644 index 0000000000000..fd42b9ff18011 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcherFactory.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.streaming.sideinput; + +import java.util.function.Function; +import org.apache.beam.runners.dataflow.options.DataflowStreamingPipelineOptions; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalData; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalDataRequest; +import org.apache.beam.sdk.annotations.Internal; + +/** + * Factory class for generating {@link SideInputStateFetcher} instances that share a {@link + * SideInputCache}. + */ +@Internal +public final class SideInputStateFetcherFactory { + private final SideInputCache globalSideInputCache; + + private SideInputStateFetcherFactory(SideInputCache globalSideInputCache) { + this.globalSideInputCache = globalSideInputCache; + } + + public static SideInputStateFetcherFactory fromOptions(DataflowStreamingPipelineOptions options) { + return new SideInputStateFetcherFactory(SideInputCache.create(options)); + } + + public SideInputStateFetcher createSideInputStateFetcher( + Function fetchGlobalDataFn) { + return new SideInputStateFetcher(fetchGlobalDataFn, globalSideInputCache); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/DirectGetDataClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/DirectGetDataClient.java index d9490f8bc7532..6ee86b6ae7241 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/DirectGetDataClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/DirectGetDataClient.java @@ -17,7 +17,7 @@ */ package org.apache.beam.runners.dataflow.worker.windmill.client.getdata; 
-import java.util.function.Supplier; +import java.util.function.Function; import org.apache.beam.runners.dataflow.worker.WorkItemCancelledException; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; @@ -28,23 +28,24 @@ public final class DirectGetDataClient implements GetDataClient { private final GetDataStream directGetDataStream; - private final Supplier sideInputGetDataStream; + private final Function sideInputGetDataStreamFactory; private final ThrottlingGetDataMetricTracker getDataMetricTracker; private DirectGetDataClient( GetDataStream directGetDataStream, - Supplier sideInputGetDataStream, + Function sideInputGetDataStreamFactory, ThrottlingGetDataMetricTracker getDataMetricTracker) { this.directGetDataStream = directGetDataStream; - this.sideInputGetDataStream = sideInputGetDataStream; + this.sideInputGetDataStreamFactory = sideInputGetDataStreamFactory; this.getDataMetricTracker = getDataMetricTracker; } public static GetDataClient create( GetDataStream getDataStream, - Supplier sideInputGetDataStream, + Function sideInputGetDataStreamFactory, ThrottlingGetDataMetricTracker getDataMetricTracker) { - return new DirectGetDataClient(getDataStream, sideInputGetDataStream, getDataMetricTracker); + return new DirectGetDataClient( + getDataStream, sideInputGetDataStreamFactory, getDataMetricTracker); } @Override @@ -74,7 +75,8 @@ public Windmill.KeyedGetDataResponse getStateData( @Override public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { - GetDataStream sideInputGetDataStream = this.sideInputGetDataStream.get(); + GetDataStream sideInputGetDataStream = + sideInputGetDataStreamFactory.apply(request.getDataId().getTag()); if (sideInputGetDataStream.isShutdown()) { throw new GetDataException( "Error occurred fetching side input for tag=" + request.getDataId()); diff --git 
a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java index 19401866d13cf..0a582ea1c6292 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java @@ -39,11 +39,12 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.AbstractWindmillStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.observers.StreamObserverFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.ThrottleTimer; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemScheduler; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; -import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.FixedStreamHeartbeatSender; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.sdk.util.BackOff; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; @@ -81,8 +82,9 @@ public final class GrpcDirectGetWorkStream private final GetWorkRequest request; private final WorkItemScheduler workItemScheduler; private final ThrottleTimer getWorkThrottleTimer; - private final Supplier getDataStream; + private final Supplier heartbeatSender; 
private final Supplier workCommitter; + private final Supplier getDataClient; /** * Map of stream IDs to their buffers. Used to aggregate streaming gRPC response chunks as they @@ -103,7 +105,8 @@ private GrpcDirectGetWorkStream( Set> streamRegistry, int logEveryNStreamFailures, ThrottleTimer getWorkThrottleTimer, - Supplier getDataStream, + Supplier heartbeatSender, + Supplier getDataClient, Supplier workCommitter, WorkItemScheduler workItemScheduler) { super( @@ -119,8 +122,9 @@ private GrpcDirectGetWorkStream( this.workItemBuffers = new ConcurrentHashMap<>(); // Use the same GetDataStream and CommitWorkStream instances to process all the work in this // stream. - this.getDataStream = Suppliers.memoize(getDataStream::get); + this.heartbeatSender = Suppliers.memoize(heartbeatSender::get); this.workCommitter = Suppliers.memoize(workCommitter::get); + this.getDataClient = Suppliers.memoize(getDataClient::get); this.inFlightBudget = new AtomicReference<>(GetWorkBudget.noBudget()); this.nextBudgetAdjustment = new AtomicReference<>(GetWorkBudget.noBudget()); this.pendingResponseBudget = new AtomicReference<>(GetWorkBudget.noBudget()); @@ -138,7 +142,8 @@ public static GrpcDirectGetWorkStream create( Set> streamRegistry, int logEveryNStreamFailures, ThrottleTimer getWorkThrottleTimer, - Supplier getDataStream, + Supplier heartbeatSender, + Supplier getDataClient, Supplier workCommitter, WorkItemScheduler workItemScheduler) { GrpcDirectGetWorkStream getWorkStream = @@ -151,7 +156,8 @@ public static GrpcDirectGetWorkStream create( streamRegistry, logEveryNStreamFailures, getWorkThrottleTimer, - getDataStream, + heartbeatSender, + getDataClient, workCommitter, workItemScheduler); getWorkStream.startStream(); @@ -336,10 +342,7 @@ private void runAndReset() { private Work.ProcessingContext createProcessingContext(String computationId) { return Work.createProcessingContext( - computationId, - getDataStream.get()::requestKeyedData, - workCommitter.get()::commit, - 
FixedStreamHeartbeatSender.create(getDataStream.get())); + computationId, getDataClient.get(), workCommitter.get()::commit, heartbeatSender.get()); } } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillStreamFactory.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillStreamFactory.java index 235c6b132fdbb..92f031db99722 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillStreamFactory.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillStreamFactory.java @@ -45,10 +45,12 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkerMetadataStream; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.observers.StreamObserverFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.ThrottleTimer; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemReceiver; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemScheduler; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.sdk.util.BackOff; import org.apache.beam.sdk.util.FluentBackoff; @@ -196,7 +198,8 @@ public GetWorkStream createDirectGetWorkStream( WindmillConnection connection, GetWorkRequest request, ThrottleTimer getWorkThrottleTimer, - Supplier getDataStream, + Supplier heartbeatSender, + 
Supplier getDataClient, Supplier workCommitter, WorkItemScheduler workItemScheduler) { return GrpcDirectGetWorkStream.create( @@ -208,7 +211,8 @@ public GetWorkStream createDirectGetWorkStream( streamRegistry, logEveryNStreamFailures, getWorkThrottleTimer, - getDataStream, + heartbeatSender, + getDataClient, workCommitter, workItemScheduler); } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClient.java index 62aa4e1c1a05f..01fb6381cd4ae 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClient.java @@ -45,6 +45,8 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkerMetadataStream; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.DirectGetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.ThrottlingGetDataMetricTracker; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.ChannelCachingStubFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.ThrottleTimer; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemScheduler; @@ -91,6 +93,7 @@ public final class StreamingEngineClient { private final Supplier getWorkerMetadataStream; private final Queue newWindmillEndpoints; private final Function workCommitterFactory; + private final 
ThrottlingGetDataMetricTracker getDataMetricTracker; /** Writes are guarded by synchronization, reads are lock free. */ private final AtomicReference connections; @@ -107,8 +110,10 @@ private StreamingEngineClient( GetWorkBudgetDistributor getWorkBudgetDistributor, GrpcDispatcherClient dispatcherClient, long clientId, - Function workCommitterFactory) { + Function workCommitterFactory, + ThrottlingGetDataMetricTracker getDataMetricTracker) { this.jobHeader = jobHeader; + this.getDataMetricTracker = getDataMetricTracker; this.started = false; this.streamFactory = streamFactory; this.workItemScheduler = workItemScheduler; @@ -171,7 +176,8 @@ public static StreamingEngineClient create( ChannelCachingStubFactory channelCachingStubFactory, GetWorkBudgetDistributor getWorkBudgetDistributor, GrpcDispatcherClient dispatcherClient, - Function workCommitterFactory) { + Function workCommitterFactory, + ThrottlingGetDataMetricTracker getDataMetricTracker) { return new StreamingEngineClient( jobHeader, totalGetWorkBudget, @@ -181,7 +187,8 @@ public static StreamingEngineClient create( getWorkBudgetDistributor, dispatcherClient, /* clientId= */ new Random().nextLong(), - workCommitterFactory); + workCommitterFactory, + getDataMetricTracker); } @VisibleForTesting @@ -194,7 +201,8 @@ static StreamingEngineClient forTesting( GetWorkBudgetDistributor getWorkBudgetDistributor, GrpcDispatcherClient dispatcherClient, long clientId, - Function workCommitterFactory) { + Function workCommitterFactory, + ThrottlingGetDataMetricTracker getDataMetricTracker) { StreamingEngineClient streamingEngineClient = new StreamingEngineClient( jobHeader, @@ -205,7 +213,8 @@ static StreamingEngineClient forTesting( getWorkBudgetDistributor, dispatcherClient, clientId, - workCommitterFactory); + workCommitterFactory, + getDataMetricTracker); streamingEngineClient.start(); return streamingEngineClient; } @@ -240,7 +249,7 @@ public ImmutableSet currentWindmillEndpoints() { * Fetches {@link GetDataStream} 
mapped to globalDataKey if one exists, or defaults to {@link * GetDataStream} pointing to dispatcher. */ - public GetDataStream getGlobalDataStream(String globalDataKey) { + private GetDataStream getGlobalDataStream(String globalDataKey) { return Optional.ofNullable(connections.get().globalDataStreams().get(globalDataKey)) .map(Supplier::get) .orElseGet( @@ -400,6 +409,9 @@ private WindmillStreamSender createAndStartWindmillStreamSenderFor( GetWorkBudget.noBudget(), streamFactory, workItemScheduler, + getDataStream -> + DirectGetDataClient.create( + getDataStream, this::getGlobalDataStream, getDataMetricTracker), workCommitterFactory); windmillStreamSender.startStreams(); return windmillStreamSender; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSender.java index 12f8a3c7f901c..7d09726e4b28a 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSender.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSender.java @@ -28,9 +28,11 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.StreamingEngineThrottleTimers; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemScheduler; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; 
+import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.FixedStreamHeartbeatSender; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Suppliers; @@ -70,6 +72,7 @@ private WindmillStreamSender( AtomicReference getWorkBudget, GrpcWindmillStreamFactory streamingEngineStreamFactory, WorkItemScheduler workItemScheduler, + Function getDataClientFactory, Function workCommitterFactory) { this.started = new AtomicBoolean(false); this.getWorkBudget = getWorkBudget; @@ -98,7 +101,8 @@ private WindmillStreamSender( connection, withRequestBudget(getWorkRequest, getWorkBudget.get()), streamingEngineThrottleTimers.getWorkThrottleTimer(), - getDataStream, + () -> FixedStreamHeartbeatSender.create(getDataStream.get()), + () -> getDataClientFactory.apply(getDataStream.get()), workCommitter, workItemScheduler)); } @@ -109,6 +113,7 @@ public static WindmillStreamSender create( GetWorkBudget getWorkBudget, GrpcWindmillStreamFactory streamingEngineStreamFactory, WorkItemScheduler workItemScheduler, + Function getDataClientFactory, Function workCommitterFactory) { return new WindmillStreamSender( connection, @@ -116,6 +121,7 @@ public static WindmillStreamSender create( new AtomicReference<>(getWorkBudget), streamingEngineStreamFactory, workItemScheduler, + getDataClientFactory, workCommitterFactory); } @@ -138,10 +144,10 @@ void closeAllStreams() { // streaming RPCs by possibly making calls over the network. Do not close the streams unless // they have already been started. 
if (started.get()) { - getWorkStream.get().halfClose(); - getDataStream.get().halfClose(); + getWorkStream.get().shutdown(); + getDataStream.get().shutdown(); workCommitter.get().stop(); - commitWorkStream.get().halfClose(); + commitWorkStream.get().shutdown(); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java index ac5deccdc200e..0fa60da71531c 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java @@ -43,6 +43,7 @@ import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.streaming.harness.StreamingCounters; import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputStateFetcher; +import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputStateFetcherFactory; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.Commit; @@ -73,7 +74,7 @@ public final class StreamingWorkScheduler { private final DataflowWorkerHarnessOptions options; private final Supplier clock; private final ComputationWorkExecutorFactory computationWorkExecutorFactory; - private final SideInputStateFetcher sideInputStateFetcher; + private final SideInputStateFetcherFactory sideInputStateFetcherFactory; private final FailureTracker failureTracker; private final WorkFailureProcessor workFailureProcessor; private final 
StreamingCommitFinalizer commitFinalizer; @@ -87,7 +88,7 @@ public StreamingWorkScheduler( DataflowWorkerHarnessOptions options, Supplier clock, ComputationWorkExecutorFactory computationWorkExecutorFactory, - SideInputStateFetcher sideInputStateFetcher, + SideInputStateFetcherFactory sideInputStateFetcherFactory, FailureTracker failureTracker, WorkFailureProcessor workFailureProcessor, StreamingCommitFinalizer commitFinalizer, @@ -99,7 +100,7 @@ public StreamingWorkScheduler( this.options = options; this.clock = clock; this.computationWorkExecutorFactory = computationWorkExecutorFactory; - this.sideInputStateFetcher = sideInputStateFetcher; + this.sideInputStateFetcherFactory = sideInputStateFetcherFactory; this.failureTracker = failureTracker; this.workFailureProcessor = workFailureProcessor; this.commitFinalizer = commitFinalizer; @@ -140,7 +141,7 @@ public static StreamingWorkScheduler create( options, clock, computationWorkExecutorFactory, - new SideInputStateFetcher(fetchGlobalDataFn, options), + SideInputStateFetcherFactory.fromOptions(options), failureTracker, workFailureProcessor, StreamingCommitFinalizer.create(workExecutor), @@ -347,7 +348,8 @@ private ExecuteWorkResult executeWork( try { WindmillStateReader stateReader = work.createWindmillStateReader(); - SideInputStateFetcher localSideInputStateFetcher = sideInputStateFetcher.byteTrackingView(); + SideInputStateFetcher localSideInputStateFetcher = + sideInputStateFetcherFactory.createSideInputStateFetcher(work::fetchSideInput); // If the read output KVs, then we can decode Windmill's byte key into userland // key object and provide it to the execution context for use with per-key state. 
diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java index 0889a66914495..be1e1278a767b 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java @@ -126,6 +126,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.Windmill.Timer.Type; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WatermarkHold; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItemCommitRequest; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.Coder.Context; @@ -315,6 +316,20 @@ private static ExecutableWork createMockWork( return createMockWork(shardedKey, workToken, computationId, ignored -> {}); } + private static GetDataClient createMockGetDataClient() { + return new GetDataClient() { + @Override + public KeyedGetDataResponse getStateData(String computation, KeyedGetDataRequest request) { + return KeyedGetDataResponse.getDefaultInstance(); + } + + @Override + public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { + return Windmill.GlobalData.getDefaultInstance(); + } + }; + } + private static ExecutableWork createMockWork( ShardedKey shardedKey, long workToken, Consumer processWorkFn) { return createMockWork(shardedKey, workToken, "computationId", processWorkFn); @@ -332,7 +347,7 @@ private static ExecutableWork createMockWork( Watermarks.builder().setInputDataWatermark(Instant.EPOCH).build(), 
Work.createProcessingContext( computationId, - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), + createMockGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)), Instant::now, @@ -3407,7 +3422,7 @@ public void testLatencyAttributionProtobufsPopulated() { Watermarks.builder().setInputDataWatermark(Instant.EPOCH).build(), Work.createProcessingContext( "computationId", - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), + createMockGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)), clock, diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java index 72b8cff6ebd53..9f8e4c2dfc140 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java @@ -61,6 +61,7 @@ import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputStateFetcher; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateReader; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; @@ -134,14 +135,26 @@ private static Work createMockWork(Windmill.WorkItem workItem, Watermarks waterm workItem, watermarks, Work.createProcessingContext( - COMPUTATION_ID, - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}, - 
mock(HeartbeatSender.class)), + COMPUTATION_ID, createMockGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)), Instant::now, Collections.emptyList()); } + private static GetDataClient createMockGetDataClient() { + return new GetDataClient() { + @Override + public Windmill.KeyedGetDataResponse getStateData( + String computation, Windmill.KeyedGetDataRequest request) { + return Windmill.KeyedGetDataResponse.getDefaultInstance(); + } + + @Override + public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { + return Windmill.GlobalData.getDefaultInstance(); + } + }; + } + @Test public void testTimerInternalsSetTimer() { Windmill.WorkItemCommitRequest.Builder outputBuilder = diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSourcesTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSourcesTest.java index 4f4d7420dc48e..504b50daa3dce 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSourcesTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSourcesTest.java @@ -95,6 +95,7 @@ import org.apache.beam.runners.dataflow.worker.util.common.worker.NativeReader; import org.apache.beam.runners.dataflow.worker.util.common.worker.NativeReader.NativeReaderIterator; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateReader; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; @@ -198,14 +199,26 @@ private static Work createMockWork(Windmill.WorkItem workItem, Watermarks waterm workItem, 
watermarks, Work.createProcessingContext( - COMPUTATION_ID, - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}, - mock(HeartbeatSender.class)), + COMPUTATION_ID, createMockGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)), Instant::now, Collections.emptyList()); } + private static GetDataClient createMockGetDataClient() { + return new GetDataClient() { + @Override + public Windmill.KeyedGetDataResponse getStateData( + String computation, Windmill.KeyedGetDataRequest request) { + return Windmill.KeyedGetDataResponse.getDefaultInstance(); + } + + @Override + public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { + return Windmill.GlobalData.getDefaultInstance(); + } + }; + } + private static class SourceProducingSubSourcesInSplit extends MockSource { int numDesiredBundle; int sourceObjectSize; @@ -1001,7 +1014,7 @@ public void testFailedWorkItemsAbort() throws Exception { Watermarks.builder().setInputDataWatermark(new Instant(0)).build(), Work.createProcessingContext( COMPUTATION_ID, - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), + createMockGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)), Instant::now, diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java index 6db9634d4ab22..663edcbcdb75d 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java @@ -32,6 +32,7 @@ import java.util.Optional; import org.apache.beam.runners.dataflow.worker.streaming.ActiveWorkState.ActivateWorkResult; import 
org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; @@ -81,10 +82,22 @@ private static ExecutableWork expiredWork(Windmill.WorkItem workItem) { private static Work.ProcessingContext createWorkProcessingContext() { return Work.createProcessingContext( - "computationId", - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}, - mock(HeartbeatSender.class)); + "computationId", createMockGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)); + } + + private static GetDataClient createMockGetDataClient() { + return new GetDataClient() { + @Override + public Windmill.KeyedGetDataResponse getStateData( + String computation, Windmill.KeyedGetDataRequest request) { + return Windmill.KeyedGetDataResponse.getDefaultInstance(); + } + + @Override + public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { + return Windmill.GlobalData.getDefaultInstance(); + } + }; } private static WorkId workId(long workToken, long cacheToken) { diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationStateCacheTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationStateCacheTest.java index 84a7c593d1530..658f12cf70ee0 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationStateCacheTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationStateCacheTest.java @@ -36,6 +36,7 @@ import 
org.apache.beam.runners.dataflow.worker.streaming.config.ComputationConfig; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; @@ -69,7 +70,7 @@ private static ExecutableWork createWork(ShardedKey shardedKey, long workToken, Watermarks.builder().setInputDataWatermark(Instant.now()).build(), Work.createProcessingContext( "computationId", - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), + createMockGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)), Instant::now, @@ -77,6 +78,21 @@ private static ExecutableWork createWork(ShardedKey shardedKey, long workToken, ignored -> {}); } + private static GetDataClient createMockGetDataClient() { + return new GetDataClient() { + @Override + public Windmill.KeyedGetDataResponse getStateData( + String computation, Windmill.KeyedGetDataRequest request) { + return Windmill.KeyedGetDataResponse.getDefaultInstance(); + } + + @Override + public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { + return Windmill.GlobalData.getDefaultInstance(); + } + }; + } + @Before public void setUp() { computationStateCache = diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcherTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcherTest.java index c27815500ed26..24a93f58b12a8 100644 --- 
a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcherTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcherTest.java @@ -67,13 +67,46 @@ @SuppressWarnings("deprecation") @RunWith(JUnit4.class) public class SideInputStateFetcherTest { - @Rule public transient Timeout globalTimeout = Timeout.seconds(600); private static final String STATE_FAMILY = "state"; - + @Rule public transient Timeout globalTimeout = Timeout.seconds(600); @Mock private GetDataClient server; @Mock private Supplier readStateSupplier; + private static Windmill.GlobalData buildGlobalDataResponse( + String tag, boolean isReady, ByteString data) { + Windmill.GlobalData.Builder builder = + Windmill.GlobalData.newBuilder() + .setDataId( + Windmill.GlobalDataId.newBuilder() + .setTag(tag) + .setVersion(ByteString.EMPTY) + .build()); + + if (isReady) { + builder.setIsReady(true).setData(data); + } else { + builder.setIsReady(false); + } + return builder.build(); + } + + private static Windmill.GlobalDataRequest buildGlobalDataRequest(String tag, ByteString version) { + Windmill.GlobalDataId id = + Windmill.GlobalDataId.newBuilder().setTag(tag).setVersion(version).build(); + + return Windmill.GlobalDataRequest.newBuilder() + .setDataId(id) + .setStateFamily(STATE_FAMILY) + .setExistenceWatermarkDeadline( + TimeUnit.MILLISECONDS.toMicros(GlobalWindow.INSTANCE.maxTimestamp().getMillis())) + .build(); + } + + private static Windmill.GlobalDataRequest buildGlobalDataRequest(String tag) { + return buildGlobalDataRequest(tag, ByteString.EMPTY); + } + @Before public void setUp() { MockitoAnnotations.initMocks(this); @@ -81,10 +114,10 @@ public void setUp() { @Test public void testFetchGlobalDataBasic() throws Exception { - SideInputStateFetcher fetcher = - new SideInputStateFetcher( - server::getSideInputData, + 
SideInputStateFetcherFactory factory = + SideInputStateFetcherFactory.fromOptions( PipelineOptionsFactory.as(DataflowStreamingPipelineOptions.class)); + SideInputStateFetcher fetcher = factory.createSideInputStateFetcher(server::getSideInputData); ByteStringOutputStream stream = new ByteStringOutputStream(); ListCoder.of(StringUtf8Coder.of()) @@ -152,10 +185,10 @@ public void testFetchGlobalDataBasic() throws Exception { @Test public void testFetchGlobalDataNull() throws Exception { - SideInputStateFetcher fetcher = - new SideInputStateFetcher( - server::getSideInputData, + SideInputStateFetcherFactory factory = + SideInputStateFetcherFactory.fromOptions( PipelineOptionsFactory.as(DataflowStreamingPipelineOptions.class)); + SideInputStateFetcher fetcher = factory.createSideInputStateFetcher(server::getSideInputData); ByteStringOutputStream stream = new ByteStringOutputStream(); ListCoder.of(VoidCoder.of()) @@ -311,10 +344,10 @@ public void testFetchGlobalDataCacheOverflow() throws Exception { @Test public void testEmptyFetchGlobalData() { - SideInputStateFetcher fetcher = - new SideInputStateFetcher( - server::getSideInputData, + SideInputStateFetcherFactory factory = + SideInputStateFetcherFactory.fromOptions( PipelineOptionsFactory.as(DataflowStreamingPipelineOptions.class)); + SideInputStateFetcher fetcher = factory.createSideInputStateFetcher(server::getSideInputData); ByteString encodedIterable = ByteString.EMPTY; @@ -346,38 +379,4 @@ public void testEmptyFetchGlobalData() { verify(server).getSideInputData(buildGlobalDataRequest(tag)); verifyNoMoreInteractions(server); } - - private static Windmill.GlobalData buildGlobalDataResponse( - String tag, boolean isReady, ByteString data) { - Windmill.GlobalData.Builder builder = - Windmill.GlobalData.newBuilder() - .setDataId( - Windmill.GlobalDataId.newBuilder() - .setTag(tag) - .setVersion(ByteString.EMPTY) - .build()); - - if (isReady) { - builder.setIsReady(true).setData(data); - } else { - 
builder.setIsReady(false); - } - return builder.build(); - } - - private static Windmill.GlobalDataRequest buildGlobalDataRequest(String tag, ByteString version) { - Windmill.GlobalDataId id = - Windmill.GlobalDataId.newBuilder().setTag(tag).setVersion(version).build(); - - return Windmill.GlobalDataRequest.newBuilder() - .setDataId(id) - .setStateFamily(STATE_FAMILY) - .setExistenceWatermarkDeadline( - TimeUnit.MILLISECONDS.toMicros(GlobalWindow.INSTANCE.maxTimestamp().getMillis())) - .build(); - } - - private static Windmill.GlobalDataRequest buildGlobalDataRequest(String tag) { - return buildGlobalDataRequest(tag, ByteString.EMPTY); - } } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutorTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutorTest.java index 35f4aad65ae74..ef73d4b0ef27d 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutorTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutorTest.java @@ -32,6 +32,7 @@ import org.apache.beam.runners.dataflow.worker.streaming.Watermarks; import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; @@ -67,7 +68,7 @@ private static ExecutableWork createWork(Consumer executeWorkFn) { Watermarks.builder().setInputDataWatermark(Instant.now()).build(), Work.createProcessingContext( 
"computationId", - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), + createMockGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)), Instant::now, @@ -75,6 +76,21 @@ private static ExecutableWork createWork(Consumer executeWorkFn) { executeWorkFn); } + private static GetDataClient createMockGetDataClient() { + return new GetDataClient() { + @Override + public Windmill.KeyedGetDataResponse getStateData( + String computation, Windmill.KeyedGetDataRequest request) { + return Windmill.KeyedGetDataResponse.getDefaultInstance(); + } + + @Override + public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { + return Windmill.GlobalData.getDefaultInstance(); + } + }; + } + private Runnable createSleepProcessWorkFn(CountDownLatch start, CountDownLatch stop) { Runnable runnable = () -> { diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingApplianceWorkCommitterTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingApplianceWorkCommitterTest.java index cfaed7ba5289b..37ab2c863c79b 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingApplianceWorkCommitterTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingApplianceWorkCommitterTest.java @@ -36,6 +36,7 @@ import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import 
org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; @@ -65,7 +66,7 @@ private static Work createMockWork(long workToken) { Watermarks.builder().setInputDataWatermark(Instant.EPOCH).build(), Work.createProcessingContext( "computationId", - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), + createMockGetDataClient(), ignored -> { throw new UnsupportedOperationException(); }, @@ -74,6 +75,21 @@ private static Work createMockWork(long workToken) { Collections.emptyList()); } + private static GetDataClient createMockGetDataClient() { + return new GetDataClient() { + @Override + public Windmill.KeyedGetDataResponse getStateData( + String computation, Windmill.KeyedGetDataRequest request) { + return Windmill.KeyedGetDataResponse.getDefaultInstance(); + } + + @Override + public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { + return Windmill.GlobalData.getDefaultInstance(); + } + }; + } + private static ComputationState createComputationState(String computationId) { return new ComputationState( computationId, diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitterTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitterTest.java index 6c44d3d954e24..36d48d778e8cc 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitterTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitterTest.java @@ -50,6 +50,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.CloseableStream; import 
org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.CommitWorkStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStreamPool; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; @@ -82,7 +83,7 @@ private static Work createMockWork(long workToken) { Watermarks.builder().setInputDataWatermark(Instant.EPOCH).build(), Work.createProcessingContext( "computationId", - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), + createMockGetDataClient(), ignored -> { throw new UnsupportedOperationException(); }, @@ -91,6 +92,21 @@ private static Work createMockWork(long workToken) { Collections.emptyList()); } + private static GetDataClient createMockGetDataClient() { + return new GetDataClient() { + @Override + public Windmill.KeyedGetDataResponse getStateData( + String computation, Windmill.KeyedGetDataRequest request) { + return Windmill.KeyedGetDataResponse.getDefaultInstance(); + } + + @Override + public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { + return Windmill.GlobalData.getDefaultInstance(); + } + }; + } + private static ComputationState createComputationState(String computationId) { return new ComputationState( computationId, diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClientTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClientTest.java index bc3afaff1b385..1999dbe319027 100644 --- 
a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClientTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClientTest.java @@ -33,13 +33,13 @@ import java.util.Comparator; import java.util.HashSet; import java.util.List; -import java.util.Optional; import java.util.Set; import java.util.concurrent.CountDownLatch; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; import javax.annotation.Nullable; +import org.apache.beam.runners.dataflow.worker.util.MemoryMonitor; import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillMetadataServiceV1Alpha1Grpc; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetWorkRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.JobHeader; @@ -48,6 +48,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.WindmillConnection; import org.apache.beam.runners.dataflow.worker.windmill.WindmillServiceAddress; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.ThrottlingGetDataMetricTracker; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.ChannelCachingStubFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillChannelFactory; import org.apache.beam.runners.dataflow.worker.windmill.testing.FakeWindmillStubFactory; @@ -97,7 +98,6 @@ public class StreamingEngineClientTest { .build(); @Rule public final GrpcCleanupRule grpcCleanup = new GrpcCleanupRule(); - @Rule public transient Timeout globalTimeout = Timeout.seconds(600); private final MutableHandlerRegistry serviceRegistry = new MutableHandlerRegistry(); private final GrpcWindmillStreamFactory streamFactory = 
spy(GrpcWindmillStreamFactory.of(JOB_HEADER).build()); @@ -109,7 +109,7 @@ public class StreamingEngineClientTest { private final GrpcDispatcherClient dispatcherClient = GrpcDispatcherClient.forTesting( stubFactory, new ArrayList<>(), new ArrayList<>(), new HashSet<>()); - + @Rule public transient Timeout globalTimeout = Timeout.seconds(600); private Server fakeStreamingEngineServer; private CountDownLatch getWorkerMetadataReady; private GetWorkerMetadataTestStub fakeGetWorkerMetadataStub; @@ -181,7 +181,8 @@ private StreamingEngineClient newStreamingEngineClient( getWorkBudgetDistributor, dispatcherClient, CLIENT_ID, - ignored -> mock(WorkCommitter.class)); + ignored -> mock(WorkCommitter.class), + new ThrottlingGetDataMetricTracker(mock(MemoryMonitor.class))); } @Test @@ -222,8 +223,6 @@ public void testStreamsStartCorrectly() throws InterruptedException { Set workerTokens = currentConnections.windmillConnections().values().stream() .map(WindmillConnection::backendWorkerToken) - .filter(Optional::isPresent) - .map(Optional::get) .collect(Collectors.toSet()); assertTrue(workerTokens.contains(workerToken)); @@ -235,7 +234,13 @@ public void testStreamsStartCorrectly() throws InterruptedException { verify(streamFactory, times(2)) .createDirectGetWorkStream( - any(), eq(getWorkRequest(0, 0)), any(), any(), any(), eq(noOpProcessWorkItemFn())); + any(), + eq(getWorkRequest(0, 0)), + any(), + any(), + any(), + any(), + eq(noOpProcessWorkItemFn())); verify(streamFactory, times(2)).createGetDataStream(any(), any()); verify(streamFactory, times(2)).createCommitWorkStream(any(), any()); @@ -312,8 +317,6 @@ public void testOnNewWorkerMetadata_correctlyRemovesStaleWindmillServers() Set workerTokens = streamingEngineClient.getCurrentConnections().windmillConnections().values().stream() .map(WindmillConnection::backendWorkerToken) - .filter(Optional::isPresent) - .map(Optional::get) .collect(Collectors.toSet()); assertFalse(workerTokens.contains(workerToken)); diff --git 
a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSenderTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSenderTest.java index 9017b673c0088..9d49c3ef3146d 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSenderTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSenderTest.java @@ -34,6 +34,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.ThrottleTimer; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemScheduler; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; @@ -108,6 +109,7 @@ public void testStartStream_startsAllStreams() { any(ThrottleTimer.class), any(), any(), + any(), eq(workItemScheduler)); verify(streamFactory).createGetDataStream(eq(connection.stub()), any(ThrottleTimer.class)); @@ -139,6 +141,7 @@ public void testStartStream_onlyStartsStreamsOnce() { any(ThrottleTimer.class), any(), any(), + any(), eq(workItemScheduler)); verify(streamFactory, times(1)) @@ -175,6 +178,7 @@ public void testStartStream_onlyStartsStreamsOnceConcurrent() throws Interrupted any(ThrottleTimer.class), any(), any(), + any(), eq(workItemScheduler)); verify(streamFactory, times(1)) @@ -210,6 +214,7 @@ public void testCloseAllStreams_closesAllStreams() { any(ThrottleTimer.class), any(), 
any(), + any(), eq(workItemScheduler))) .thenReturn(mockGetWorkStream); @@ -226,9 +231,9 @@ public void testCloseAllStreams_closesAllStreams() { windmillStreamSender.startStreams(); windmillStreamSender.closeAllStreams(); - verify(mockGetWorkStream).halfClose(); - verify(mockGetDataStream).halfClose(); - verify(mockCommitWorkStream).halfClose(); + verify(mockGetWorkStream).shutdown(); + verify(mockGetDataStream).shutdown(); + verify(mockCommitWorkStream).shutdown(); } private WindmillStreamSender newWindmillStreamSender(GetWorkBudget budget) { @@ -243,6 +248,7 @@ private WindmillStreamSender newWindmillStreamSender( budget, streamFactory, workItemScheduler, + ignored -> mock(GetDataClient.class), ignored -> mock(WorkCommitter.class)); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributorTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributorTest.java index 68f4559bfd3f0..b0c305dc4ec45 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributorTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributorTest.java @@ -33,6 +33,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.Windmill.JobHeader; import org.apache.beam.runners.dataflow.worker.windmill.WindmillConnection; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcWindmillStreamFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.WindmillStreamSender; import 
org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ManagedChannel; @@ -260,6 +261,7 @@ private WindmillStreamSender createWindmillStreamSender(GetWorkBudget getWorkBud .build()) .build(), (workItem, watermarks, processingContext, ackWorkItemQueued, getWorkStreamLatencies) -> {}, + ignored -> mock(GetDataClient.class), ignored -> mock(WorkCommitter.class)); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessorTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessorTest.java index 079c6b4640449..ea90bb276a4bb 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessorTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessorTest.java @@ -35,6 +35,7 @@ import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; @@ -88,7 +89,7 @@ private static ExecutableWork createWork(Supplier clock, Consumer Watermarks.builder().setInputDataWatermark(Instant.EPOCH).build(), Work.createProcessingContext( "computationId", - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), + createMockGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)), clock, @@ -96,6 +97,21 @@ 
private static ExecutableWork createWork(Supplier clock, Consumer processWorkFn); } + private static GetDataClient createMockGetDataClient() { + return new GetDataClient() { + @Override + public Windmill.KeyedGetDataResponse getStateData( + String computation, Windmill.KeyedGetDataRequest request) { + return Windmill.KeyedGetDataResponse.getDefaultInstance(); + } + + @Override + public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { + return Windmill.GlobalData.getDefaultInstance(); + } + }; + } + private static ExecutableWork createWork(Consumer processWorkFn) { return createWork(Instant::now, processWorkFn); } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresherTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresherTest.java index d6052270966cf..dbd5959293167 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresherTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresherTest.java @@ -47,6 +47,7 @@ import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.direct.Clock; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; @@ -93,6 +94,21 @@ private static ComputationState createComputationState( stateCache); } + private static GetDataClient createMockGetDataClient() { + return new GetDataClient() { + 
@Override + public Windmill.KeyedGetDataResponse getStateData( + String computation, Windmill.KeyedGetDataRequest request) { + return Windmill.KeyedGetDataResponse.getDefaultInstance(); + } + + @Override + public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { + return Windmill.GlobalData.getDefaultInstance(); + } + }; + } + private ActiveWorkRefresher createActiveWorkRefresher( Supplier clock, int activeWorkRefreshPeriodMillis, @@ -126,10 +142,7 @@ private ExecutableWork createOldWork( .build(), Watermarks.builder().setInputDataWatermark(Instant.EPOCH).build(), Work.createProcessingContext( - "computationId", - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}, - heartbeatSender), + "computationId", createMockGetDataClient(), ignored -> {}, heartbeatSender), A_LONG_TIME_AGO, ImmutableList.of()), processWork); From 3e7f12469df5c226e14a0291670102c522553ce7 Mon Sep 17 00:00:00 2001 From: Martin Trieu Date: Fri, 12 Jul 2024 23:28:25 +0900 Subject: [PATCH 5/7] don't let FanOutWorkRefreshClient exceptions crash the user worker --- .../worker/StreamingDataflowWorker.java | 2 -- .../getdata/FanOutWorkRefreshClient.java | 17 +++++++++++-- .../processing/StreamingWorkScheduler.java | 1 - .../work/refresh/ActiveWorkRefresher.java | 24 +++++++++---------- .../refresh/FixedStreamHeartbeatSender.java | 5 ++++ 5 files changed, 31 insertions(+), 18 deletions(-) diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java index 58e7a370366fc..50e4dc2d2c855 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java @@ 
-316,7 +316,6 @@ private StreamingDataflowWorker( this.workerStatusReporter = workerStatusReporter; this.streamingCounters = streamingCounters; this.memoryMonitor = memoryMonitor; - this.streamingWorkScheduler = StreamingWorkScheduler.create( options, @@ -325,7 +324,6 @@ private StreamingDataflowWorker( mapTaskExecutorFactory, workUnitExecutor, stateCache::forComputation, - getDataClient::getSideInputData, failureTracker, workFailureProcessor, streamingCounters, diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FanOutWorkRefreshClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FanOutWorkRefreshClient.java index 426fb7b02f7ec..79cde43ffc24b 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FanOutWorkRefreshClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FanOutWorkRefreshClient.java @@ -27,6 +27,8 @@ import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.Heartbeats; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * {@link WorkRefreshClient} that fans out heartbeats to all {@link HeartbeatSender}(s) in parallel @@ -34,6 +36,7 @@ */ @Internal public final class FanOutWorkRefreshClient implements WorkRefreshClient { + private static final Logger LOG = LoggerFactory.getLogger(FanOutWorkRefreshClient.class); private static final String FAN_OUT_REFRESH_WORK_EXECUTOR_NAME = "FanOutActiveWorkRefreshExecutor"; @@ -44,7 +47,13 @@ public FanOutWorkRefreshClient(ThrottlingGetDataMetricTracker getDataMetricTrack this.getDataMetricTracker = getDataMetricTracker; 
this.fanOutActiveWorkRefreshExecutor = Executors.newCachedThreadPool( - new ThreadFactoryBuilder().setNameFormat(FAN_OUT_REFRESH_WORK_EXECUTOR_NAME).build()); + new ThreadFactoryBuilder() + // FanOutWorkRefreshClient runs as a background process, don't let failures crash + // the worker. + .setUncaughtExceptionHandler( + (t, e) -> LOG.error("Unexpected failure in {}", t.getName(), e)) + .setNameFormat(FAN_OUT_REFRESH_WORK_EXECUTOR_NAME) + .build()); } @Override @@ -71,7 +80,11 @@ private CompletableFuture sendHeartbeatOnStreamFuture( Heartbeats heartbeats = heartbeat.getValue(); sender.sendHeartbeats(heartbeats); } catch (Exception e) { - throw new GetDataClient.GetDataException("Error refreshing heartbeats.", e); + LOG.error( + "Unable to send {} heartbeats to {}.", + heartbeat.getValue().size(), + heartbeat.getKey(), + new GetDataClient.GetDataException("Error refreshing heartbeats.", e)); } }, fanOutActiveWorkRefreshExecutor); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java index 0fa60da71531c..95e5c42bf59c3 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java @@ -118,7 +118,6 @@ public static StreamingWorkScheduler create( DataflowMapTaskExecutorFactory mapTaskExecutorFactory, BoundedQueueExecutor workExecutor, Function stateCacheFactory, - Function fetchGlobalDataFn, FailureTracker failureTracker, WorkFailureProcessor workFailureProcessor, StreamingCounters streamingCounters, diff --git 
a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java index 7f8e3bee66fef..5c79fb1ee402b 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java @@ -30,7 +30,7 @@ import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; import org.apache.beam.runners.dataflow.worker.streaming.ComputationState; import org.apache.beam.runners.dataflow.worker.streaming.RefreshableWork; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; import org.apache.beam.sdk.annotations.Internal; import org.joda.time.Duration; import org.joda.time.Instant; @@ -74,16 +74,6 @@ public ActiveWorkRefresher( this.heartbeatSender = heartbeatSender; } - private static Windmill.HeartbeatRequest createHeartbeatRequest( - RefreshableWork work, DataflowExecutionStateSampler sampler) { - return Windmill.HeartbeatRequest.newBuilder() - .setShardingKey(work.getShardedKey().shardingKey()) - .setWorkToken(work.id().workToken()) - .setCacheToken(work.id().cacheToken()) - .addAllLatencyAttribution(work.getHeartbeatLatencyAttributions(sampler)) - .build(); - } - @SuppressWarnings("FutureReturnValueIgnored") public void start() { if (activeWorkRefreshPeriodMillis > 0) { @@ -136,8 +126,7 @@ private void refreshActiveWork() { heartbeatsBySender .computeIfAbsent(work.heartbeatSender(), ignored -> Heartbeats.builder()) .addWork(work) - .addHeartbeatRequest( - computationState.getComputationId(), createHeartbeatRequest(work, 
sampler)); + .addHeartbeatRequest(computationState.getComputationId(), createHeartbeatRequest(work)); } } @@ -145,4 +134,13 @@ private void refreshActiveWork() { heartbeatsBySender.entrySet().stream() .collect(toImmutableMap(Map.Entry::getKey, e -> e.getValue().build()))); } + + private HeartbeatRequest createHeartbeatRequest(RefreshableWork work) { + return HeartbeatRequest.newBuilder() + .setShardingKey(work.getShardedKey().shardingKey()) + .setWorkToken(work.id().workToken()) + .setCacheToken(work.id().cacheToken()) + .addAllLatencyAttribution(work.getHeartbeatLatencyAttributions(sampler)) + .build(); + } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/FixedStreamHeartbeatSender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/FixedStreamHeartbeatSender.java index 7da48d4f0218d..a03ff4b430979 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/FixedStreamHeartbeatSender.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/FixedStreamHeartbeatSender.java @@ -71,4 +71,9 @@ public boolean equals(Object obj) { return obj instanceof FixedStreamHeartbeatSender && getDataStream.equals(((FixedStreamHeartbeatSender) obj).getDataStream); } + + @Override + public String toString() { + return "HeartbeatSender-" + getDataStream.backendWorkerToken(); + } } From d5acf0fd9018cb7e4481d1af011ea1c75490baf5 Mon Sep 17 00:00:00 2001 From: Martin Trieu Date: Fri, 26 Jul 2024 00:52:46 +0900 Subject: [PATCH 6/7] address PR comments --- .../worker/StreamingDataflowWorker.java | 72 ++++----- .../worker/WorkItemCancelledException.java | 4 + .../worker/streaming/ActiveWorkState.java | 7 +- .../worker/streaming/RefreshableWork.java | 3 - .../dataflow/worker/streaming/Work.java | 9 
-- .../sideinput/SideInputStateFetcher.java | 2 + .../client/AbstractWindmillStream.java | 26 ++-- .../windmill/client/WindmillStream.java | 36 +---- .../commits/StreamingEngineWorkCommitter.java | 114 ++++++++------ .../getdata/ApplianceGetDataClient.java | 55 ++----- .../getdata/FanOutWorkRefreshClient.java | 92 ------------ .../client/getdata/GetDataClient.java | 24 ++- ...taClient.java => StreamGetDataClient.java} | 63 ++++---- ...ient.java => StreamPoolGetDataClient.java} | 44 ++---- .../ThrottlingGetDataMetricTracker.java | 127 ++++++---------- .../client/grpc/GrpcCommitWorkStream.java | 1 + .../client/grpc/GrpcDirectGetWorkStream.java | 3 +- .../client/grpc/GrpcGetDataStream.java | 5 + .../client/grpc/GrpcGetWorkStream.java | 1 + .../grpc/GrpcGetWorkerMetadataStream.java | 1 + .../client/grpc/StreamingEngineClient.java | 4 +- .../work/refresh/ActiveWorkRefresher.java | 91 +++++++++--- .../refresh/FixedStreamHeartbeatSender.java | 16 +- .../work/refresh/HeartbeatSender.java | 6 +- .../windmill/work/refresh/Heartbeats.java | 18 ++- .../dataflow/worker/FakeWindmillServer.java | 15 -- .../worker/StreamingDataflowWorkerTest.java | 23 +-- .../StreamingModeExecutionContextTest.java | 19 +-- .../worker/WorkerCustomSourcesTest.java | 21 +-- .../worker/streaming/ActiveWorkStateTest.java | 19 +-- .../streaming/ComputationStateCacheTest.java | 19 +-- .../worker/util/BoundedQueueExecutorTest.java | 35 ++--- .../client/WindmillStreamPoolTest.java | 10 -- .../StreamingApplianceWorkCommitterTest.java | 19 +-- .../StreamingEngineWorkCommitterTest.java | 55 +++---- .../client/getdata/FakeGetDataClient.java} | 24 ++- .../ThrottlingGetDataMetricTrackerTest.java | 139 +++--------------- .../failures/WorkFailureProcessorTest.java | 19 +-- .../work/refresh/ActiveWorkRefresherTest.java | 80 ++++------ 39 files changed, 483 insertions(+), 838 deletions(-) delete mode 100644 
runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FanOutWorkRefreshClient.java rename runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/{DirectGetDataClient.java => StreamGetDataClient.java} (63%) rename runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/{StreamingEngineGetDataClient.java => StreamPoolGetDataClient.java} (63%) rename runners/google-cloud-dataflow-java/worker/src/{main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/WorkRefreshClient.java => test/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FakeGetDataClient.java} (56%) diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java index 50e4dc2d2c855..c30e1620a5e11 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java @@ -75,9 +75,8 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.ApplianceGetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; -import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.StreamingEngineGetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.StreamPoolGetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.ThrottlingGetDataMetricTracker; -import 
org.apache.beam.runners.dataflow.worker.windmill.client.getdata.WorkRefreshClient; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.ChannelzServlet; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcDispatcherClient; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcWindmillServer; @@ -217,12 +216,16 @@ private StreamingDataflowWorker( this.workCommitter = windmillServiceEnabled - ? StreamingEngineWorkCommitter.create( - WindmillStreamPool.create( - numCommitThreads, COMMIT_STREAM_TIMEOUT, windmillServer::commitWorkStream) - ::getCloseableStream, - numCommitThreads, - this::onCompleteCommit) + ? StreamingEngineWorkCommitter.builder() + .setCommitWorkStreamFactory( + WindmillStreamPool.create( + numCommitThreads, + COMMIT_STREAM_TIMEOUT, + windmillServer::commitWorkStream) + ::getCloseableStream) + .setNumCommitSenders(numCommitThreads) + .setOnCommitComplete(this::onCompleteCommit) + .build() : StreamingApplianceWorkCommitter.create( windmillServer::commitWork, this::onCompleteCommit); @@ -252,31 +255,26 @@ private StreamingDataflowWorker( ThrottlingGetDataMetricTracker getDataMetricTracker = new ThrottlingGetDataMetricTracker(memoryMonitor); - WindmillStreamPool getDataStreamPool = - WindmillStreamPool.create( - Math.max(1, options.getWindmillGetDataStreamCount()), - GET_DATA_STREAM_TIMEOUT, - windmillServer::getDataStream); - - // Register standard file systems. - FileSystems.setDefaultPipelineOptions(options); - - int stuckCommitDurationMillis = - windmillServiceEnabled && options.getStuckCommitDurationMillis() > 0 - ? 
options.getStuckCommitDurationMillis() - : 0; - - WorkRefreshClient workRefreshClient; + int stuckCommitDurationMillis; if (windmillServiceEnabled) { - StreamingEngineGetDataClient streamingEngineGetDataClient = - new StreamingEngineGetDataClient(getDataMetricTracker, getDataStreamPool); - this.getDataClient = streamingEngineGetDataClient; - workRefreshClient = streamingEngineGetDataClient; + WindmillStreamPool getDataStreamPool = + WindmillStreamPool.create( + Math.max(1, options.getWindmillGetDataStreamCount()), + GET_DATA_STREAM_TIMEOUT, + windmillServer::getDataStream); + this.getDataClient = new StreamPoolGetDataClient(getDataMetricTracker, getDataStreamPool); + this.heartbeatSender = + new StreamPoolHeartbeatSender( + options.getUseSeparateWindmillHeartbeatStreams() + ? WindmillStreamPool.create( + 1, GET_DATA_STREAM_TIMEOUT, windmillServer::getDataStream) + : getDataStreamPool); + stuckCommitDurationMillis = + options.getStuckCommitDurationMillis() > 0 ? options.getStuckCommitDurationMillis() : 0; } else { - ApplianceGetDataClient applianceGetDataClient = - new ApplianceGetDataClient(windmillServer, getDataMetricTracker); - this.getDataClient = applianceGetDataClient; - workRefreshClient = applianceGetDataClient; + this.getDataClient = new ApplianceGetDataClient(windmillServer, getDataMetricTracker); + this.heartbeatSender = new ApplianceHeartbeatSender(windmillServer::getData); + stuckCommitDurationMillis = 0; } this.activeWorkRefresher = @@ -287,7 +285,7 @@ private StreamingDataflowWorker( computationStateCache::getAllPresentComputations, sampler, executorSupplier.apply("RefreshWork"), - workRefreshClient::refreshActiveWork); + getDataMetricTracker::trackHeartbeats); WorkerStatusPages workerStatusPages = WorkerStatusPages.create(DEFAULT_STATUS_PORT, memoryMonitor); @@ -333,14 +331,8 @@ private StreamingDataflowWorker( ID_GENERATOR, stageInfoMap); - this.heartbeatSender = - options.isEnableStreamingEngine() - ? 
new StreamPoolHeartbeatSender( - options.getUseSeparateWindmillHeartbeatStreams() - ? WindmillStreamPool.create( - 1, GET_DATA_STREAM_TIMEOUT, windmillServer::getDataStream) - : getDataStreamPool) - : new ApplianceHeartbeatSender(windmillServer::getData); + // Register standard file systems. + FileSystems.setDefaultPipelineOptions(options); LOG.debug("windmillServiceEnabled: {}", windmillServiceEnabled); LOG.debug("WindmillServiceEndpoint: {}", options.getWindmillServiceEndpoint()); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WorkItemCancelledException.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WorkItemCancelledException.java index 934977fe0985e..ec5122a8732ab 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WorkItemCancelledException.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WorkItemCancelledException.java @@ -26,6 +26,10 @@ public WorkItemCancelledException(long sharding_key) { super("Work item cancelled for key " + sharding_key); } + public WorkItemCancelledException(Throwable e) { + super(e); + } + /** Returns whether an exception was caused by a {@link WorkItemCancelledException}. 
*/ public static boolean isWorkItemCancelledException(Throwable t) { while (t != null) { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java index 64309d0a75010..56b0e3f539a50 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java @@ -206,14 +206,17 @@ synchronized ImmutableListMultimap getReadOnlyActiv .collect( flatteningToImmutableListMultimap( Entry::getKey, - e -> e.getValue().stream().map(ExecutableWork::work).map(Work::refreshableView))); + e -> + e.getValue().stream() + .map(ExecutableWork::work) + .map(work -> (RefreshableWork) work))); } synchronized ImmutableList getRefreshableWork(Instant refreshDeadline) { return activeWork.values().stream() .flatMap(Deque::stream) .map(ExecutableWork::work) - .filter(work -> work.isRefreshable(refreshDeadline)) + .filter(work -> !work.isFailed() && work.getStartTime().isBefore(refreshDeadline)) .collect(toImmutableList()); } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/RefreshableWork.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/RefreshableWork.java index a1668d9ae7851..c51b04f23719f 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/RefreshableWork.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/RefreshableWork.java @@ -22,7 +22,6 @@ import 
org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; -import org.joda.time.Instant; /** View of {@link Work} that exposes an interface for work refreshing. */ @Internal @@ -32,8 +31,6 @@ public interface RefreshableWork { ShardedKey getShardedKey(); - boolean isRefreshable(Instant refreshDeadline); - HeartbeatSender heartbeatSender(); ImmutableList getHeartbeatLatencyAttributions( diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java index 71ffd98ac1c03..e77823602eda7 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java @@ -150,10 +150,6 @@ private static LatencyAttribution.Builder createLatencyAttributionWithActiveLate return latencyAttribution; } - public RefreshableWork refreshableView() { - return this; - } - public WorkItem getWorkItem() { return workItem; } @@ -209,11 +205,6 @@ public String getLatencyTrackingId() { return latencyTrackingId; } - @Override - public boolean isRefreshable(Instant refreshDeadline) { - return !isFailed && getStartTime().isBefore(refreshDeadline); - } - @Override public HeartbeatSender heartbeatSender() { return processingContext.heartbeatSender(); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcher.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcher.java index 113b760556dfd..303cdeb94f8c6 100644 
--- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcher.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcher.java @@ -34,6 +34,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalData; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalDataRequest; +import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.IterableCoder; import org.apache.beam.sdk.coders.KvCoder; @@ -52,6 +53,7 @@ /** Class responsible for fetching side input state from the streaming backend. */ @NotThreadSafe +@Internal public class SideInputStateFetcher { private static final Logger LOG = LoggerFactory.getLogger(SideInputStateFetcher.class); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java index b00c4c9c0c7fe..fd0d1b1a3a92d 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java @@ -85,10 +85,11 @@ public abstract class AbstractWindmillStream implements Win private final Supplier> requestObserverSupplier; // Indicates if the current stream in requestObserver is closed by calling close() method private final AtomicBoolean streamClosed; - private @Nullable StreamObserver requestObserver; private final String backendWorkerToken; + private @Nullable StreamObserver 
requestObserver; protected AbstractWindmillStream( + String debugStreamType, Function, StreamObserver> clientFactory, BackOff backoff, StreamObserverFactory streamObserverFactory, @@ -100,7 +101,7 @@ protected AbstractWindmillStream( Executors.newSingleThreadExecutor( new ThreadFactoryBuilder() .setDaemon(true) - .setNameFormat(createThreadName(streamType(), backendWorkerToken)) + .setNameFormat(createThreadName(debugStreamType, backendWorkerToken)) .build()); this.backoff = backoff; this.streamRegistry = streamRegistry; @@ -122,10 +123,10 @@ protected AbstractWindmillStream( clientFactory, new AbstractWindmillStream.ResponseObserver()); } - private static String createThreadName(Type streamType, String backendWorkerToken) { + private static String createThreadName(String streamType, String backendWorkerToken) { return !backendWorkerToken.isEmpty() - ? String.format("%s-%s-WindmillStream-thread", streamType.name(), backendWorkerToken) - : String.format("%s-WindmillStream-thread", streamType.name()); + ? String.format("%s-%s-WindmillStream-thread", streamType, backendWorkerToken) + : String.format("%s-WindmillStream-thread", streamType); } private static long debugDuration(long nowMs, long startMs) { @@ -151,6 +152,11 @@ private static long debugDuration(long nowMs, long startMs) { */ protected abstract void startThrottleTimer(); + /** Reflects that {@link #shutdown()} was explicitly called. 
*/ + protected boolean isShutdown() { + return isShutdown.get(); + } + private StreamObserver requestObserver() { if (requestObserver == null) { throw new NullPointerException( @@ -274,15 +280,11 @@ public String backendWorkerToken() { @Override public void shutdown() { if (isShutdown.compareAndSet(false, true)) { - halfClose(); + requestObserver() + .onError(new WindmillStreamShutdownException("Explicit call to shutdown stream.")); } } - @Override - public boolean isShutdown() { - return isShutdown.get(); - } - private void setLastError(String error) { lastError.set(error); lastErrorTime.set(DateTime.now()); @@ -313,7 +315,7 @@ public void onCompleted() { private void onStreamFinished(@Nullable Throwable t) { synchronized (this) { - if (clientClosed.get() && !hasPendingRequests()) { + if (isShutdown.get() || (clientClosed.get() && !hasPendingRequests())) { streamRegistry.remove(AbstractWindmillStream.this); finishLatch.countDown(); return; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java index a4bfa69ad7798..ee467c01c8f6e 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java @@ -52,18 +52,6 @@ public interface WindmillStream { */ void shutdown(); - /** Reflects that {@link #shutdown()} was explicitly called. */ - boolean isShutdown(); - - Type streamType(); - - enum Type { - GET_WORKER_METADATA, - GET_WORK, - GET_DATA, - COMMIT_WORK, - } - /** Handle representing a stream of GetWork responses. 
*/ @ThreadSafe interface GetWorkStream extends WindmillStream { @@ -72,11 +60,6 @@ interface GetWorkStream extends WindmillStream { /** Returns the remaining in-flight {@link GetWorkBudget}. */ GetWorkBudget remainingBudget(); - - @Override - default Type streamType() { - return Type.GET_WORK; - } } /** Interface for streaming GetDataRequests to Windmill. */ @@ -93,11 +76,6 @@ Windmill.KeyedGetDataResponse requestKeyedData( void refreshActiveWork(Map> heartbeats); void onHeartbeatResponse(List responses); - - @Override - default Type streamType() { - return Type.GET_DATA; - } } /** Interface for streaming CommitWorkRequests to Windmill. */ @@ -109,11 +87,6 @@ interface CommitWorkStream extends WindmillStream { */ CommitWorkStream.RequestBatcher batcher(); - @Override - default Type streamType() { - return Type.COMMIT_WORK; - } - @NotThreadSafe interface RequestBatcher extends Closeable { /** @@ -140,10 +113,11 @@ default void close() { /** Interface for streaming GetWorkerMetadata requests to Windmill. 
*/ @ThreadSafe - interface GetWorkerMetadataStream extends WindmillStream { - @Override - default Type streamType() { - return Type.GET_WORKER_METADATA; + interface GetWorkerMetadataStream extends WindmillStream {} + + class WindmillStreamShutdownException extends RuntimeException { + public WindmillStreamShutdownException(String message) { + super(message); } } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitter.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitter.java index 911b6809c2429..afdb29560a2b2 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitter.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitter.java @@ -17,6 +17,7 @@ */ package org.apache.beam.runners.dataflow.worker.windmill.client.commits; +import com.google.auto.value.AutoBuilder; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; @@ -55,10 +56,11 @@ public final class StreamingEngineWorkCommitter implements WorkCommitter { private final int numCommitSenders; private final AtomicBoolean isRunning; - private StreamingEngineWorkCommitter( + StreamingEngineWorkCommitter( Supplier> commitWorkStreamFactory, int numCommitSenders, - Consumer onCommitComplete) { + Consumer onCommitComplete, + String backendWorkerToken) { this.commitWorkStreamFactory = commitWorkStreamFactory; this.commitQueue = WeightedBoundedQueue.create( @@ -69,7 +71,10 @@ private StreamingEngineWorkCommitter( new ThreadFactoryBuilder() .setDaemon(true) .setPriority(Thread.MAX_PRIORITY) - .setNameFormat("CommitThread-%d") + .setNameFormat( + 
backendWorkerToken.isEmpty() + ? "CommitThread-%d" + : "CommitThread-" + backendWorkerToken + "-%d") .build()); this.activeCommitBytes = new AtomicLong(); this.onCommitComplete = onCommitComplete; @@ -77,32 +82,33 @@ private StreamingEngineWorkCommitter( this.isRunning = new AtomicBoolean(false); } - public static StreamingEngineWorkCommitter create( - Supplier> commitWorkStreamFactory, - int numCommitSenders, - Consumer onCommitComplete) { - return new StreamingEngineWorkCommitter( - commitWorkStreamFactory, numCommitSenders, onCommitComplete); + public static Builder builder() { + return new AutoBuilder_StreamingEngineWorkCommitter_Builder() + .setBackendWorkerToken("") + .setNumCommitSenders(1); } @Override @SuppressWarnings("FutureReturnValueIgnored") public void start() { - if (isRunning.compareAndSet(false, true) && !commitSenders.isShutdown()) { - for (int i = 0; i < numCommitSenders; i++) { - commitSenders.submit(this::streamingCommitLoop); - } + Preconditions.checkState( + isRunning.compareAndSet(false, true), "Multiple calls to WorkCommitter.start()."); + for (int i = 0; i < numCommitSenders; i++) { + commitSenders.submit(this::streamingCommitLoop); } } @Override public void commit(Commit commit) { - if (commit.work().isFailed() || !isRunning.get()) { - LOG.debug( - "Trying to queue commit on shutdown, failing commit=[computationId={}, shardingKey={}, workId={} ].", - commit.computationId(), - commit.work().getShardedKey(), - commit.work().id()); + boolean isShutdown = !this.isRunning.get(); + if (commit.work().isFailed() || isShutdown) { + if (isShutdown) { + LOG.debug( + "Trying to queue commit on shutdown, failing commit=[computationId={}, shardingKey={}, workId={} ].", + commit.computationId(), + commit.work().getShardedKey(), + commit.work().id()); + } failCommit(commit); } else { commitQueue.put(commit); @@ -116,17 +122,16 @@ public long currentActiveCommitBytes() { @Override public void stop() { - if (isRunning.compareAndSet(true, false) && 
!commitSenders.isTerminated()) { - commitSenders.shutdownNow(); - try { - commitSenders.awaitTermination(10, TimeUnit.SECONDS); - } catch (InterruptedException e) { - LOG.warn( - "Commit senders didn't complete shutdown within 10 seconds, continuing to drain queue.", - e); - } - drainCommitQueue(); + Preconditions.checkState(isRunning.compareAndSet(true, false)); + commitSenders.shutdownNow(); + try { + commitSenders.awaitTermination(10, TimeUnit.SECONDS); + } catch (InterruptedException e) { + LOG.warn( + "Commit senders didn't complete shutdown within 10 seconds, continuing to drain queue.", + e); } + drainCommitQueue(); } private void drainCommitQueue() { @@ -150,7 +155,7 @@ public int parallelism() { private void streamingCommitLoop() { @Nullable Commit initialCommit = null; try { - while (true) { + while (isRunning.get()) { if (initialCommit == null) { try { // Block until we have a commit or are shutting down. @@ -169,17 +174,14 @@ private void streamingCommitLoop() { } try (CloseableStream closeableCommitStream = - commitWorkStreamFactory.get()) { - CommitWorkStream commitStream = closeableCommitStream.stream(); - try (CommitWorkStream.RequestBatcher batcher = commitStream.batcher()) { - if (!tryAddToCommitBatch(initialCommit, batcher)) { - throw new AssertionError( - "Initial commit on flushed stream should always be accepted."); - } - // Batch additional commits to the stream and possibly make an un-batched commit the - // next initial commit. - initialCommit = expandBatch(batcher); + commitWorkStreamFactory.get(); + CommitWorkStream.RequestBatcher batcher = closeableCommitStream.stream().batcher()) { + if (!tryAddToCommitBatch(initialCommit, batcher)) { + throw new AssertionError("Initial commit on flushed stream should always be accepted."); } + // Batch additional commits to the stream and possibly make an un-batched commit the + // next initial commit. 
+ initialCommit = expandBatch(batcher); } catch (Exception e) { LOG.error("Error occurred sending commits.", e); } @@ -200,7 +202,7 @@ private boolean tryAddToCommitBatch(Commit commit, CommitWorkStream.RequestBatch batcher.commitWorkItem( commit.computationId(), commit.request(), - (commitStatus) -> { + commitStatus -> { onCommitComplete.accept(CompleteCommit.create(commit, commitStatus)); activeCommitBytes.addAndGet(-commit.getSize()); }); @@ -214,11 +216,13 @@ private boolean tryAddToCommitBatch(Commit commit, CommitWorkStream.RequestBatch return isCommitAccepted; } - // Helper to batch additional commits into the commit batch as long as they fit. - // Returns a commit that was removed from the queue but not consumed or null. - private Commit expandBatch(CommitWorkStream.RequestBatcher batcher) { + /** + * Helper to batch additional commits into the commit batch as long as they fit. Returns a commit + * that was removed from the queue but not consumed or null. + */ + private @Nullable Commit expandBatch(CommitWorkStream.RequestBatcher batcher) { int commits = 1; - while (true) { + while (isRunning.get()) { Commit commit; try { if (commits < TARGET_COMMIT_BATCH_KEYS) { @@ -245,5 +249,25 @@ private Commit expandBatch(CommitWorkStream.RequestBatcher batcher) { } commits++; } + + return null; + } + + @AutoBuilder + public interface Builder { + Builder setCommitWorkStreamFactory( + Supplier> commitWorkStreamFactory); + + Builder setNumCommitSenders(int numCommitSenders); + + Builder setOnCommitComplete(Consumer onCommitComplete); + + Builder setBackendWorkerToken(String backendWorkerToken); + + StreamingEngineWorkCommitter autoBuild(); + + default WorkCommitter build() { + return autoBuild(); + } } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ApplianceGetDataClient.java 
b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ApplianceGetDataClient.java index dc5adb4e7966c..e0500dde0c538 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ApplianceGetDataClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ApplianceGetDataClient.java @@ -29,18 +29,15 @@ import org.apache.beam.runners.dataflow.worker.windmill.ApplianceWindmillClient; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.ComputationGetDataRequest; -import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; -import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.Heartbeats; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.SettableFuture; import org.checkerframework.checker.nullness.qual.Nullable; /** Appliance implementation of {@link GetDataClient}. 
*/ @Internal @ThreadSafe -public final class ApplianceGetDataClient implements GetDataClient, WorkRefreshClient { +public final class ApplianceGetDataClient implements GetDataClient { private static final int MAX_READS_PER_BATCH = 60; private static final int MAX_ACTIVE_READS = 10; @@ -61,19 +58,12 @@ public ApplianceGetDataClient( this.activeReadThreads = 0; } - public static GetDataClient create( - ApplianceWindmillClient windmillClient, ThrottlingGetDataMetricTracker getDataMetricTracker) { - return new ApplianceGetDataClient(windmillClient, getDataMetricTracker); - } - @Override public Windmill.KeyedGetDataResponse getStateData( - String computation, Windmill.KeyedGetDataRequest request) { - try (AutoCloseable ignored = - getDataMetricTracker.trackSingleCallWithThrottling( - ThrottlingGetDataMetricTracker.Type.STATE)) { + String computationId, Windmill.KeyedGetDataRequest request) { + try (AutoCloseable ignored = getDataMetricTracker.trackStateDataFetchWithThrottling()) { SettableFuture response = SettableFuture.create(); - ReadBatch batch = addToReadBatch(new QueueEntry(computation, request, response)); + ReadBatch batch = addToReadBatch(new QueueEntry(computationId, request, response)); if (batch != null) { issueReadBatch(batch); } @@ -81,7 +71,7 @@ public Windmill.KeyedGetDataResponse getStateData( } catch (Exception e) { throw new GetDataException( "Error occurred fetching state for computation=" - + computation + + computationId + ", key=" + request.getShardingKey(), e); @@ -90,9 +80,7 @@ public Windmill.KeyedGetDataResponse getStateData( @Override public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { - try (AutoCloseable ignored = - getDataMetricTracker.trackSingleCallWithThrottling( - ThrottlingGetDataMetricTracker.Type.STATE)) { + try (AutoCloseable ignored = getDataMetricTracker.trackSideInputFetchWithThrottling()) { return windmillClient 
.getData(Windmill.GetDataRequest.newBuilder().addGlobalDataFetchRequests(request).build()) .getGlobalData(0); @@ -102,28 +90,6 @@ public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) } } - /** - * Appliance sends heartbeats (used to refresh active work) as KeyedGetDataRequests. So we must - * translate the HeartbeatRequest to a KeyedGetDataRequest. - */ - @Override - public void refreshActiveWork(Map heartbeats) { - Map.Entry heartbeat = - Iterables.getOnlyElement(heartbeats.entrySet()); - HeartbeatSender heartbeatSender = heartbeat.getKey(); - Heartbeats heartbeatToSend = heartbeat.getValue(); - - if (heartbeatToSend.heartbeatRequests().isEmpty()) { - return; - } - - try (AutoCloseable ignored = getDataMetricTracker.trackHeartbeats(heartbeatToSend.size())) { - heartbeatSender.sendHeartbeats(heartbeatToSend); - } catch (Exception e) { - throw new GetDataException("Error occurred refreshing heartbeats=" + heartbeatToSend, e); - } - } - @Override public synchronized void printHtml(PrintWriter writer) { getDataMetricTracker.printHtml(writer); @@ -133,7 +99,8 @@ public synchronized void printHtml(PrintWriter writer) { private void issueReadBatch(ReadBatch batch) { try { - Preconditions.checkState(batch.startRead.get()); + // Possibly block until the batch is allowed to start. + batch.startRead.get(); } catch (InterruptedException e) { // We don't expect this thread to be interrupted. To simplify handling, we just fall through // to issuing the call. @@ -191,7 +158,7 @@ private void issueReadBatch(ReadBatch batch) { } else { // Notify the thread responsible for issuing the next batch read. 
ReadBatch startBatch = pendingReadBatches.remove(0); - startBatch.startRead.set(true); + startBatch.startRead.set(null); } } } @@ -227,13 +194,13 @@ private void issueReadBatch(ReadBatch batch) { } ReadBatch batch = new ReadBatch(); batch.reads.add(entry); - batch.startRead.set(true); + batch.startRead.set(null); return batch; } private static final class ReadBatch { ArrayList reads = new ArrayList<>(); - SettableFuture startRead = SettableFuture.create(); + SettableFuture startRead = SettableFuture.create(); } private static final class QueueEntry { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FanOutWorkRefreshClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FanOutWorkRefreshClient.java deleted file mode 100644 index 79cde43ffc24b..0000000000000 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FanOutWorkRefreshClient.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.beam.runners.dataflow.worker.windmill.client.getdata; - -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; -import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.Heartbeats; -import org.apache.beam.sdk.annotations.Internal; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * {@link WorkRefreshClient} that fans out heartbeats to all {@link HeartbeatSender}(s) in parallel - * passed into {@link #refreshActiveWork(Map)} - */ -@Internal -public final class FanOutWorkRefreshClient implements WorkRefreshClient { - private static final Logger LOG = LoggerFactory.getLogger(FanOutWorkRefreshClient.class); - private static final String FAN_OUT_REFRESH_WORK_EXECUTOR_NAME = - "FanOutActiveWorkRefreshExecutor"; - - private final ThrottlingGetDataMetricTracker getDataMetricTracker; - private final ExecutorService fanOutActiveWorkRefreshExecutor; - - public FanOutWorkRefreshClient(ThrottlingGetDataMetricTracker getDataMetricTracker) { - this.getDataMetricTracker = getDataMetricTracker; - this.fanOutActiveWorkRefreshExecutor = - Executors.newCachedThreadPool( - new ThreadFactoryBuilder() - // FanOutWorkRefreshClient runs as a background process, don't let failures crash - // the worker. 
- .setUncaughtExceptionHandler( - (t, e) -> LOG.error("Unexpected failure in {}", t.getName(), e)) - .setNameFormat(FAN_OUT_REFRESH_WORK_EXECUTOR_NAME) - .build()); - } - - @Override - public void refreshActiveWork(Map heartbeats) { - List> fanOutRefreshActiveWork = new ArrayList<>(); - for (Map.Entry heartbeat : heartbeats.entrySet()) { - fanOutRefreshActiveWork.add(sendHeartbeatOnStreamFuture(heartbeat)); - } - - // Don't block until we kick off all the refresh active work RPCs. - @SuppressWarnings("rawtypes") - CompletableFuture parallelFanOutRefreshActiveWork = - CompletableFuture.allOf(fanOutRefreshActiveWork.toArray(new CompletableFuture[0])); - parallelFanOutRefreshActiveWork.join(); - } - - private CompletableFuture sendHeartbeatOnStreamFuture( - Map.Entry heartbeat) { - return CompletableFuture.runAsync( - () -> { - try (AutoCloseable ignored = - getDataMetricTracker.trackHeartbeats(heartbeat.getValue().size())) { - HeartbeatSender sender = heartbeat.getKey(); - Heartbeats heartbeats = heartbeat.getValue(); - sender.sendHeartbeats(heartbeats); - } catch (Exception e) { - LOG.error( - "Unable to send {} heartbeats to {}.", - heartbeat.getValue().size(), - heartbeat.getKey(), - new GetDataClient.GetDataException("Error refreshing heartbeats.", e)); - } - }, - fanOutActiveWorkRefreshExecutor); - } -} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/GetDataClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/GetDataClient.java index 4577b29f8850f..c732591bf12d1 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/GetDataClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/GetDataClient.java @@ -27,18 +27,30 @@ /** Client for streaming 
backend GetData API. */ @Internal public interface GetDataClient { - KeyedGetDataResponse getStateData(String computation, KeyedGetDataRequest request); + /** + * Issues a blocking call to fetch state data for a specific computation and {@link + * org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItem}. + * + * @throws GetDataException when there was an unexpected error during the attempted fetch. + */ + KeyedGetDataResponse getStateData(String computationId, KeyedGetDataRequest request) + throws GetDataException; - GlobalData getSideInputData(GlobalDataRequest request); + /** + * Issues a blocking call to fetch side input data. + * + * @throws GetDataException when there was an unexpected error during the attempted fetch. + */ + GlobalData getSideInputData(GlobalDataRequest request) throws GetDataException; - default void printHtml(PrintWriter writer) {} + void printHtml(PrintWriter writer); - class GetDataException extends RuntimeException { - protected GetDataException(String message, Throwable cause) { + final class GetDataException extends RuntimeException { + GetDataException(String message, Throwable cause) { super(message, cause); } - public GetDataException(String message) { + GetDataException(String message) { super(message); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/DirectGetDataClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamGetDataClient.java similarity index 63% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/DirectGetDataClient.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamGetDataClient.java index 6ee86b6ae7241..b0625384641e2 100644 --- 
a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/DirectGetDataClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamGetDataClient.java @@ -17,25 +17,27 @@ */ package org.apache.beam.runners.dataflow.worker.windmill.client.getdata; +import java.io.PrintWriter; import java.util.function.Function; import org.apache.beam.runners.dataflow.worker.WorkItemCancelledException; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; import org.apache.beam.sdk.annotations.Internal; /** {@link GetDataClient} that fetches data directly from a specific {@link GetDataStream}. */ @Internal -public final class DirectGetDataClient implements GetDataClient { +public final class StreamGetDataClient implements GetDataClient { - private final GetDataStream directGetDataStream; + private final GetDataStream getDataStream; private final Function sideInputGetDataStreamFactory; private final ThrottlingGetDataMetricTracker getDataMetricTracker; - private DirectGetDataClient( - GetDataStream directGetDataStream, + private StreamGetDataClient( + GetDataStream getDataStream, Function sideInputGetDataStreamFactory, ThrottlingGetDataMetricTracker getDataMetricTracker) { - this.directGetDataStream = directGetDataStream; + this.getDataStream = getDataStream; this.sideInputGetDataStreamFactory = sideInputGetDataStreamFactory; this.getDataMetricTracker = getDataMetricTracker; } @@ -44,51 +46,56 @@ public static GetDataClient create( GetDataStream getDataStream, Function sideInputGetDataStreamFactory, ThrottlingGetDataMetricTracker getDataMetricTracker) { - return new DirectGetDataClient( + return new StreamGetDataClient( getDataStream, 
sideInputGetDataStreamFactory, getDataMetricTracker); } + /** + * @throws WorkItemCancelledException when the fetch fails due to the stream being shutdown, + * indicating that the {@link + * org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItem} that triggered the + * fetch has been cancelled. + */ @Override public Windmill.KeyedGetDataResponse getStateData( - String computation, Windmill.KeyedGetDataRequest request) { - if (directGetDataStream.isShutdown()) { + String computationId, Windmill.KeyedGetDataRequest request) throws GetDataException { + try (AutoCloseable ignored = getDataMetricTracker.trackStateDataFetchWithThrottling()) { + return getDataStream.requestKeyedData(computationId, request); + } catch (WindmillStream.WindmillStreamShutdownException e) { throw new WorkItemCancelledException(request.getShardingKey()); - } - - try (AutoCloseable ignored = - getDataMetricTracker.trackSingleCallWithThrottling( - ThrottlingGetDataMetricTracker.Type.STATE)) { - return directGetDataStream.requestKeyedData(computation, request); } catch (Exception e) { - if (directGetDataStream.isShutdown()) { - throw new WorkItemCancelledException(request.getShardingKey()); - } - throw new GetDataException( "Error occurred fetching state for computation=" - + computation + + computationId + ", key=" + request.getShardingKey(), e); } } + /** + * @throws WorkItemCancelledException when the fetch fails due to the stream being shutdown, + * indicating that the {@link + * org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItem} that triggered the + * fetch has been cancelled. 
+ */ @Override - public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { + public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) + throws GetDataException { GetDataStream sideInputGetDataStream = sideInputGetDataStreamFactory.apply(request.getDataId().getTag()); - if (sideInputGetDataStream.isShutdown()) { - throw new GetDataException( - "Error occurred fetching side input for tag=" + request.getDataId()); - } - - try (AutoCloseable ignored = - getDataMetricTracker.trackSingleCallWithThrottling( - ThrottlingGetDataMetricTracker.Type.SIDE_INPUT)) { + try (AutoCloseable ignored = getDataMetricTracker.trackSideInputFetchWithThrottling()) { return sideInputGetDataStream.requestGlobalData(request); + } catch (WindmillStream.WindmillStreamShutdownException e) { + throw new WorkItemCancelledException(e); } catch (Exception e) { throw new GetDataException( "Error occurred fetching side input for tag=" + request.getDataId(), e); } } + + @Override + public void printHtml(PrintWriter writer) { + getDataMetricTracker.printHtml(writer); + } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamingEngineGetDataClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamPoolGetDataClient.java similarity index 63% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamingEngineGetDataClient.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamPoolGetDataClient.java index 54967f039f2d2..d6b20e425b0ba 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamingEngineGetDataClient.java +++ 
b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamPoolGetDataClient.java @@ -18,7 +18,6 @@ package org.apache.beam.runners.dataflow.worker.windmill.client.getdata; import java.io.PrintWriter; -import java.util.Map; import javax.annotation.concurrent.ThreadSafe; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalDataRequest; @@ -26,25 +25,22 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.CloseableStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStreamPool; -import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; -import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.Heartbeats; import org.apache.beam.sdk.annotations.Internal; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; /** * StreamingEngine implementation of {@link GetDataClient}. * - * @implNote Uses {@link WindmillStreamPool} to send/receive requests. Depending on options, may use - * a dedicated stream pool for heartbeats. + * @implNote Uses {@link WindmillStreamPool} to send requests. Depending on options, may use a + * dedicated stream pool for heartbeats. 
*/ @Internal @ThreadSafe -public final class StreamingEngineGetDataClient implements GetDataClient, WorkRefreshClient { +public final class StreamPoolGetDataClient implements GetDataClient { private final WindmillStreamPool getDataStreamPool; private final ThrottlingGetDataMetricTracker getDataMetricTracker; - public StreamingEngineGetDataClient( + public StreamPoolGetDataClient( ThrottlingGetDataMetricTracker getDataMetricTracker, WindmillStreamPool getDataStreamPool) { this.getDataMetricTracker = getDataMetricTracker; @@ -53,16 +49,14 @@ public StreamingEngineGetDataClient( @Override public Windmill.KeyedGetDataResponse getStateData( - String computation, KeyedGetDataRequest request) { - try (AutoCloseable ignored = - getDataMetricTracker.trackSingleCallWithThrottling( - ThrottlingGetDataMetricTracker.Type.STATE); + String computationId, KeyedGetDataRequest request) { + try (AutoCloseable ignored = getDataMetricTracker.trackStateDataFetchWithThrottling(); CloseableStream closeableStream = getDataStreamPool.getCloseableStream()) { - return closeableStream.stream().requestKeyedData(computation, request); + return closeableStream.stream().requestKeyedData(computationId, request); } catch (Exception e) { throw new GetDataException( "Error occurred fetching state for computation=" - + computation + + computationId + ", key=" + request.getShardingKey(), e); @@ -71,9 +65,7 @@ public Windmill.KeyedGetDataResponse getStateData( @Override public Windmill.GlobalData getSideInputData(GlobalDataRequest request) { - try (AutoCloseable ignored = - getDataMetricTracker.trackSingleCallWithThrottling( - ThrottlingGetDataMetricTracker.Type.SIDE_INPUT); + try (AutoCloseable ignored = getDataMetricTracker.trackSideInputFetchWithThrottling(); CloseableStream closeableStream = getDataStreamPool.getCloseableStream()) { return closeableStream.stream().requestGlobalData(request); } catch (Exception e) { @@ -82,24 +74,6 @@ public Windmill.GlobalData getSideInputData(GlobalDataRequest 
request) { } } - @Override - public void refreshActiveWork(Map heartbeats) { - Map.Entry heartbeat = - Iterables.getOnlyElement(heartbeats.entrySet()); - HeartbeatSender heartbeatSender = heartbeat.getKey(); - Heartbeats heartbeatToSend = heartbeat.getValue(); - - if (heartbeatToSend.heartbeatRequests().isEmpty()) { - return; - } - - try (AutoCloseable ignored = getDataMetricTracker.trackHeartbeats(heartbeatToSend.size())) { - heartbeatSender.sendHeartbeats(heartbeatToSend); - } catch (Exception e) { - throw new GetDataException("Error occurred refreshing heartbeats=" + heartbeatToSend, e); - } - } - @Override public void printHtml(PrintWriter writer) { getDataMetricTracker.printHtml(writer); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTracker.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTracker.java index d356f205817a4..a66cf932bd742 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTracker.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTracker.java @@ -26,29 +26,45 @@ import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; /** - * Wraps GetData calls that tracks metrics for the number of in-flight requests and throttles - * requests when memory pressure is high. + * Wraps GetData calls to track metrics for the number of in-flight requests and throttles requests + * when memory pressure is high. 
*/ @Internal @ThreadSafe public final class ThrottlingGetDataMetricTracker { + private static final String GET_STATE_DATA_RESOURCE_CONTEXT = "GetStateData"; + private static final String GET_SIDE_INPUT_RESOURCE_CONTEXT = "GetSideInputData"; + private final MemoryMonitor gcThrashingMonitor; - private final GetDataMetrics getDataMetrics; + private final AtomicInteger activeStateReads; + private final AtomicInteger activeSideInputs; + private final AtomicInteger activeHeartbeats; public ThrottlingGetDataMetricTracker(MemoryMonitor gcThrashingMonitor) { this.gcThrashingMonitor = gcThrashingMonitor; - this.getDataMetrics = GetDataMetrics.create(); + this.activeStateReads = new AtomicInteger(); + this.activeSideInputs = new AtomicInteger(); + this.activeHeartbeats = new AtomicInteger(); + } + + /** + * Tracks a state data fetch. If there is memory pressure, may throttle requests. Returns an + * {@link AutoCloseable} that will decrement the metric after the call is finished. + */ + AutoCloseable trackStateDataFetchWithThrottling() { + gcThrashingMonitor.waitForResources(GET_STATE_DATA_RESOURCE_CONTEXT); + activeStateReads.getAndIncrement(); + return activeStateReads::getAndDecrement; } /** - * Tracks a GetData call. If there is memory pressure, may throttle requests. Returns an {@link - * AutoCloseable} that will decrement the metric after the call is finished. + * Tracks a side input fetch. If there is memory pressure, may throttle requests. Returns an + * {@link AutoCloseable} that will decrement the metric after the call is finished. 
*/ - public AutoCloseable trackSingleCallWithThrottling(Type callType) { - gcThrashingMonitor.waitForResources(callType.debugName); - AtomicInteger getDataMetricTracker = getDataMetrics.getMetricFor(callType); - getDataMetricTracker.getAndIncrement(); - return getDataMetricTracker::getAndDecrement; + AutoCloseable trackSideInputFetchWithThrottling() { + gcThrashingMonitor.waitForResources(GET_SIDE_INPUT_RESOURCE_CONTEXT); + activeSideInputs.getAndIncrement(); + return activeSideInputs::getAndDecrement; } /** @@ -56,91 +72,38 @@ public AutoCloseable trackSingleCallWithThrottling(Type callType) { * metric after the call is finished. */ public AutoCloseable trackHeartbeats(int numHeartbeats) { - getDataMetrics - .activeHeartbeats() - .getAndUpdate(currentActiveHeartbeats -> currentActiveHeartbeats + numHeartbeats); - return () -> - getDataMetrics.activeHeartbeats().getAndUpdate(existing -> existing - numHeartbeats); + activeHeartbeats.getAndUpdate( + currentActiveHeartbeats -> currentActiveHeartbeats + numHeartbeats); + return () -> activeHeartbeats.getAndUpdate(existing -> existing - numHeartbeats); } public void printHtml(PrintWriter writer) { writer.println("Active Fetches:"); - getDataMetrics.printMetrics(writer); + writer.println(" Side Inputs: " + activeSideInputs.get()); + writer.println(" State Reads: " + activeStateReads.get()); + writer.println("Heartbeat Keys Active: " + activeHeartbeats.get()); } @VisibleForTesting - GetDataMetrics.ReadOnlySnapshot getMetricsSnapshot() { - return getDataMetrics.snapshot(); - } - - public enum Type { - STATE("GetStateData"), - SIDE_INPUT("GetSideInputData"), - HEARTBEAT("RefreshActiveWork"); - private final String debugName; - - Type(String debugName) { - this.debugName = debugName; - } - - public final String debugName() { - return debugName; - } + ReadOnlySnapshot getMetricsSnapshot() { + return ReadOnlySnapshot.create( + activeSideInputs.get(), activeStateReads.get(), activeHeartbeats.get()); } + @VisibleForTesting 
@AutoValue - abstract static class GetDataMetrics { - private static GetDataMetrics create() { - return new AutoValue_ThrottlingGetDataMetricTracker_GetDataMetrics( - new AtomicInteger(), new AtomicInteger(), new AtomicInteger()); - } - - abstract AtomicInteger activeSideInputs(); - - abstract AtomicInteger activeStateReads(); - - abstract AtomicInteger activeHeartbeats(); - - private ReadOnlySnapshot snapshot() { - return ReadOnlySnapshot.create( - activeSideInputs().get(), activeStateReads().get(), activeHeartbeats().get()); - } + abstract static class ReadOnlySnapshot { - private AtomicInteger getMetricFor(Type callType) { - switch (callType) { - case STATE: - return activeStateReads(); - case SIDE_INPUT: - return activeSideInputs(); - case HEARTBEAT: - return activeHeartbeats(); - - default: - // Should never happen, switch is exhaustive. - throw new IllegalStateException("Unsupported CallType=" + callType); - } - } - - private void printMetrics(PrintWriter writer) { - writer.println(" Side Inputs: " + activeSideInputs().get()); - writer.println(" State Reads: " + activeStateReads().get()); - writer.println("Heartbeat Keys Active: " + activeHeartbeats().get()); + private static ReadOnlySnapshot create( + int activeSideInputs, int activeStateReads, int activeHeartbeats) { + return new AutoValue_ThrottlingGetDataMetricTracker_ReadOnlySnapshot( + activeSideInputs, activeStateReads, activeHeartbeats); } - @AutoValue - abstract static class ReadOnlySnapshot { + abstract int activeSideInputs(); - private static ReadOnlySnapshot create( - int activeSideInputs, int activeStateReads, int activeHeartbeats) { - return new AutoValue_ThrottlingGetDataMetricTracker_GetDataMetrics_ReadOnlySnapshot( - activeSideInputs, activeStateReads, activeHeartbeats); - } + abstract int activeStateReads(); - abstract int activeSideInputs(); - - abstract int activeStateReads(); - - abstract int activeHeartbeats(); - } + abstract int activeHeartbeats(); } } diff --git 
a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcCommitWorkStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcCommitWorkStream.java index 232461e34e633..053843a8af253 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcCommitWorkStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcCommitWorkStream.java @@ -69,6 +69,7 @@ private GrpcCommitWorkStream( AtomicLong idGenerator, int streamingRpcBatchLimit) { super( + "CommitWorkStream", startCommitWorkRpcFn, backoff, streamObserverFactory, diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java index 0a582ea1c6292..58f72610e2d35 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java @@ -110,6 +110,7 @@ private GrpcDirectGetWorkStream( Supplier workCommitter, WorkItemScheduler workItemScheduler) { super( + "GetWorkStream", startGetWorkRpcFn, backoff, streamObserverFactory, @@ -120,8 +121,6 @@ private GrpcDirectGetWorkStream( this.getWorkThrottleTimer = getWorkThrottleTimer; this.workItemScheduler = workItemScheduler; this.workItemBuffers = new ConcurrentHashMap<>(); - // Use the same GetDataStream and CommitWorkStream instances to process all the work in this - // stream. 
this.heartbeatSender = Suppliers.memoize(heartbeatSender::get); this.workCommitter = Suppliers.memoize(workCommitter::get); this.getDataClient = Suppliers.memoize(getDataClient::get); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetDataStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetDataStream.java index 5600a8f0f413b..0e9a0c6316ee0 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetDataStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetDataStream.java @@ -90,6 +90,7 @@ private GrpcGetDataStream( boolean sendKeyedGetDataRequests, Consumer> processHeartbeatResponses) { super( + "GetDataStream", startGetDataRpcFn, backoff, streamObserverFactory, @@ -199,6 +200,10 @@ public GlobalData requestGlobalData(GlobalDataRequest request) { @Override public void refreshActiveWork(Map> heartbeats) { + if (isShutdown()) { + throw new WindmillStreamShutdownException("Unable to refresh work for shutdown stream."); + } + StreamingGetDataRequest.Builder builder = StreamingGetDataRequest.newBuilder(); if (sendKeyedGetDataRequests) { long builderBytes = 0; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkStream.java index 5fc093ee32aa9..4b392e9190ed2 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkStream.java +++ 
b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkStream.java @@ -73,6 +73,7 @@ private GrpcGetWorkStream( ThrottleTimer getWorkThrottleTimer, WorkItemReceiver receiver) { super( + "GetWorkStream", startGetWorkRpcFn, backoff, streamObserverFactory, diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStream.java index 6f734b7da9dcb..44e21a9b18edd 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStream.java @@ -65,6 +65,7 @@ private GrpcGetWorkerMetadataStream( ThrottleTimer getWorkerMetadataThrottleTimer, Consumer serverMappingConsumer) { super( + "GetWorkerMetadataStream", startGetWorkerMetadataRpcFn, backoff, streamObserverFactory, diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClient.java index 01fb6381cd4ae..b9573ff94cc9a 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClient.java @@ -45,7 +45,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; import 
org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkerMetadataStream; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; -import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.DirectGetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.StreamGetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.ThrottlingGetDataMetricTracker; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.ChannelCachingStubFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.ThrottleTimer; @@ -410,7 +410,7 @@ private WindmillStreamSender createAndStartWindmillStreamSenderFor( streamFactory, workItemScheduler, getDataStream -> - DirectGetDataClient.create( + StreamGetDataClient.create( getDataStream, this::getGlobalDataStream, getDataMetricTracker), workCommitterFactory); windmillStreamSender.startStreams(); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java index 5c79fb1ee402b..c4dc375cdb020 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java @@ -19,19 +19,24 @@ import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap.toImmutableMap; +import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; +import java.util.List; import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import 
java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; -import java.util.function.Consumer; import java.util.function.Supplier; import javax.annotation.concurrent.ThreadSafe; import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; import org.apache.beam.runners.dataflow.worker.streaming.ComputationState; import org.apache.beam.runners.dataflow.worker.streaming.RefreshableWork; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; import org.joda.time.Duration; import org.joda.time.Instant; import org.slf4j.Logger; @@ -48,14 +53,17 @@ @Internal public final class ActiveWorkRefresher { private static final Logger LOG = LoggerFactory.getLogger(ActiveWorkRefresher.class); + private static final String FAN_OUT_REFRESH_WORK_EXECUTOR_NAME = + "FanOutActiveWorkRefreshExecutor-%d"; private final Supplier clock; private final int activeWorkRefreshPeriodMillis; private final Supplier> computations; private final DataflowExecutionStateSampler sampler; private final int stuckCommitDurationMillis; + private final HeartbeatTracker heartbeatTracker; private final ScheduledExecutorService activeWorkRefreshExecutor; - private final Consumer> heartbeatSender; + private final ExecutorService fanOutActiveWorkRefreshExecutor; public ActiveWorkRefresher( Supplier clock, @@ -64,14 +72,23 @@ public ActiveWorkRefresher( Supplier> computations, DataflowExecutionStateSampler sampler, ScheduledExecutorService activeWorkRefreshExecutor, - Consumer> heartbeatSender) { + HeartbeatTracker heartbeatTracker) { this.clock = clock; this.activeWorkRefreshPeriodMillis = activeWorkRefreshPeriodMillis; this.stuckCommitDurationMillis = 
stuckCommitDurationMillis; this.computations = computations; this.sampler = sampler; this.activeWorkRefreshExecutor = activeWorkRefreshExecutor; - this.heartbeatSender = heartbeatSender; + this.heartbeatTracker = heartbeatTracker; + this.fanOutActiveWorkRefreshExecutor = + Executors.newCachedThreadPool( + new ThreadFactoryBuilder() + // Work refresh runs as a background process, don't let failures crash + // the worker. + .setUncaughtExceptionHandler( + (t, e) -> LOG.error("Unexpected failure in {}", t.getName(), e)) + .setNameFormat(FAN_OUT_REFRESH_WORK_EXECUTOR_NAME) + .build()); } @SuppressWarnings("FutureReturnValueIgnored") @@ -115,9 +132,41 @@ private void invalidateStuckCommits() { } } + /** Create {@link Heartbeats} and group them by {@link HeartbeatSender}. */ private void refreshActiveWork() { Instant refreshDeadline = clock.get().minus(Duration.millis(activeWorkRefreshPeriodMillis)); + Map heartbeatsBySender = + aggregateHeartbeatsBySender(refreshDeadline); + if (heartbeatsBySender.isEmpty()) { + return; + } + + if (heartbeatsBySender.size() == 1) { + // If there is a single HeartbeatSender, just use the calling thread to send heartbeats. + Map.Entry heartbeat = + Iterables.getOnlyElement(heartbeatsBySender.entrySet()); + sendHeartbeat(heartbeat); + } else { + // If there are multiple HeartbeatSenders, send out the heartbeats in parallel using the + // fanOutActiveWorkRefreshExecutor. + List> fanOutRefreshActiveWork = new ArrayList<>(); + for (Map.Entry heartbeat : heartbeatsBySender.entrySet()) { + fanOutRefreshActiveWork.add( + CompletableFuture.runAsync( + () -> sendHeartbeat(heartbeat), fanOutActiveWorkRefreshExecutor)); + } + + // Don't block until we kick off all the refresh active work RPCs. 
+ @SuppressWarnings("rawtypes") + CompletableFuture parallelFanOutRefreshActiveWork = + CompletableFuture.allOf(fanOutRefreshActiveWork.toArray(new CompletableFuture[0])); + parallelFanOutRefreshActiveWork.join(); + } + } + + /** Aggregate the heartbeats across computations by HeartbeatSender for correct fan out. */ + private Map aggregateHeartbeatsBySender(Instant refreshDeadline) { Map heartbeatsBySender = new HashMap<>(); // Aggregate the heartbeats across computations by HeartbeatSender for correct fan out. @@ -125,22 +174,30 @@ private void refreshActiveWork() { for (RefreshableWork work : computationState.getRefreshableWork(refreshDeadline)) { heartbeatsBySender .computeIfAbsent(work.heartbeatSender(), ignored -> Heartbeats.builder()) - .addWork(work) - .addHeartbeatRequest(computationState.getComputationId(), createHeartbeatRequest(work)); + .add(computationState.getComputationId(), work, sampler); } } - heartbeatSender.accept( - heartbeatsBySender.entrySet().stream() - .collect(toImmutableMap(Map.Entry::getKey, e -> e.getValue().build()))); + return heartbeatsBySender.entrySet().stream() + .collect(toImmutableMap(Map.Entry::getKey, e -> e.getValue().build())); + } + + private void sendHeartbeat(Map.Entry heartbeat) { + try (AutoCloseable ignored = heartbeatTracker.trackHeartbeats(heartbeat.getValue().size())) { + HeartbeatSender sender = heartbeat.getKey(); + Heartbeats heartbeats = heartbeat.getValue(); + sender.sendHeartbeats(heartbeats); + } catch (Exception e) { + LOG.error( + "Unable to send {} heartbeats to {}.", + heartbeat.getValue().size(), + heartbeat.getKey(), + e); + } } - private HeartbeatRequest createHeartbeatRequest(RefreshableWork work) { - return HeartbeatRequest.newBuilder() - .setShardingKey(work.getShardedKey().shardingKey()) - .setWorkToken(work.id().workToken()) - .setCacheToken(work.id().cacheToken()) - .addAllLatencyAttribution(work.getHeartbeatLatencyAttributions(sampler)) - .build(); + @FunctionalInterface + public interface 
HeartbeatTracker { + AutoCloseable trackHeartbeats(int numHeartbeats); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/FixedStreamHeartbeatSender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/FixedStreamHeartbeatSender.java index a03ff4b430979..b1c42618b09cb 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/FixedStreamHeartbeatSender.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/FixedStreamHeartbeatSender.java @@ -19,6 +19,7 @@ import java.util.Objects; import org.apache.beam.runners.dataflow.worker.streaming.RefreshableWork; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; import org.apache.beam.sdk.annotations.Internal; import org.slf4j.Logger; @@ -49,15 +50,24 @@ public static FixedStreamHeartbeatSender create(GetDataStream getDataStream) { @Override public void sendHeartbeats(Heartbeats heartbeats) { - if (getDataStream.isShutdown()) { + String threadName = Thread.currentThread().getName(); + try { + String backendWorkerToken = getDataStream.backendWorkerToken(); + if (!backendWorkerToken.isEmpty()) { + // Decorate the thread name w/ the backendWorkerToken for debugging. Resets the thread's + // name after sending the heartbeats succeeds or fails. + Thread.currentThread().setName(threadName + "-" + backendWorkerToken); + } + getDataStream.refreshActiveWork(heartbeats.heartbeatRequests().asMap()); + } catch (WindmillStream.WindmillStreamShutdownException e) { LOG.warn( "Trying to refresh work w/ {} heartbeats on stream={} after work has moved off of worker." 
+ " heartbeats", getDataStream.backendWorkerToken(), heartbeats.heartbeatRequests().size()); heartbeats.work().forEach(RefreshableWork::setFailed); - } else { - getDataStream.refreshActiveWork(heartbeats.heartbeatRequests().asMap()); + } finally { + Thread.currentThread().setName(threadName); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatSender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatSender.java index 3ee0090ebcaa8..06559344332ca 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatSender.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatSender.java @@ -17,7 +17,11 @@ */ package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; -/** Interface for sending heartbeats. */ +/** + * Interface for sending heartbeats. + * + * @implNote Batching/grouping of heartbeats is performed by HeartbeatSender equality. 
+ */ @FunctionalInterface public interface HeartbeatSender { /** diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeats.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeats.java index cff65ca183257..78e9864f4eed3 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeats.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeats.java @@ -18,6 +18,7 @@ package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; import com.google.auto.value.AutoValue; +import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; import org.apache.beam.runners.dataflow.worker.streaming.RefreshableWork; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; @@ -45,21 +46,32 @@ public abstract static class Builder { abstract ImmutableList.Builder workBuilder(); - public final Builder addWork(RefreshableWork work) { + public final Builder add( + String computationId, RefreshableWork work, DataflowExecutionStateSampler sampler) { workBuilder().add(work); + addHeartbeatRequest(computationId, createHeartbeatRequest(work, sampler)); return this; } + private Windmill.HeartbeatRequest createHeartbeatRequest( + RefreshableWork work, DataflowExecutionStateSampler sampler) { + return Windmill.HeartbeatRequest.newBuilder() + .setShardingKey(work.getShardedKey().shardingKey()) + .setWorkToken(work.id().workToken()) + .setCacheToken(work.id().cacheToken()) + .addAllLatencyAttribution(work.getHeartbeatLatencyAttributions(sampler)) + .build(); + } + abstract Builder setHeartbeatRequests( ImmutableListMultimap value); abstract 
ImmutableListMultimap.Builder heartbeatRequestsBuilder(); - public final Builder addHeartbeatRequest( + private void addHeartbeatRequest( String computationId, Windmill.HeartbeatRequest heartbeatRequest) { heartbeatRequestsBuilder().put(computationId, heartbeatRequest); - return this; } public abstract Heartbeats build(); diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java index 5406a72927393..b3f7467cdbd34 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java @@ -239,11 +239,6 @@ public String backendWorkerToken() { @Override public void shutdown() {} - @Override - public boolean isShutdown() { - return false; - } - @Override public void halfClose() { done.countDown(); @@ -315,11 +310,6 @@ public String backendWorkerToken() { @Override public void shutdown() {} - @Override - public boolean isShutdown() { - return false; - } - @Override public Windmill.KeyedGetDataResponse requestKeyedData( String computation, KeyedGetDataRequest request) { @@ -401,11 +391,6 @@ public String backendWorkerToken() { @Override public void shutdown() {} - @Override - public boolean isShutdown() { - return false; - } - @Override public RequestBatcher batcher() { return new RequestBatcher() { diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java index be1e1278a767b..0d2eb29975508 100644 --- 
a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java @@ -126,7 +126,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.Windmill.Timer.Type; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WatermarkHold; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItemCommitRequest; -import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.FakeGetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.Coder.Context; @@ -316,20 +316,6 @@ private static ExecutableWork createMockWork( return createMockWork(shardedKey, workToken, computationId, ignored -> {}); } - private static GetDataClient createMockGetDataClient() { - return new GetDataClient() { - @Override - public KeyedGetDataResponse getStateData(String computation, KeyedGetDataRequest request) { - return KeyedGetDataResponse.getDefaultInstance(); - } - - @Override - public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { - return Windmill.GlobalData.getDefaultInstance(); - } - }; - } - private static ExecutableWork createMockWork( ShardedKey shardedKey, long workToken, Consumer processWorkFn) { return createMockWork(shardedKey, workToken, "computationId", processWorkFn); @@ -346,10 +332,7 @@ private static ExecutableWork createMockWork( .build(), Watermarks.builder().setInputDataWatermark(Instant.EPOCH).build(), Work.createProcessingContext( - computationId, - createMockGetDataClient(), - ignored -> {}, - mock(HeartbeatSender.class)), + computationId, new FakeGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)), 
Instant::now, Collections.emptyList()), processWorkFn); @@ -3422,7 +3405,7 @@ public void testLatencyAttributionProtobufsPopulated() { Watermarks.builder().setInputDataWatermark(Instant.EPOCH).build(), Work.createProcessingContext( "computationId", - createMockGetDataClient(), + new FakeGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)), clock, diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java index 9f8e4c2dfc140..2bd6621dd4f44 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java @@ -61,7 +61,7 @@ import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputStateFetcher; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.FakeGetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateReader; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; @@ -135,26 +135,11 @@ private static Work createMockWork(Windmill.WorkItem workItem, Watermarks waterm workItem, watermarks, Work.createProcessingContext( - COMPUTATION_ID, createMockGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)), + COMPUTATION_ID, new FakeGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)), Instant::now, 
Collections.emptyList()); } - private static GetDataClient createMockGetDataClient() { - return new GetDataClient() { - @Override - public Windmill.KeyedGetDataResponse getStateData( - String computation, Windmill.KeyedGetDataRequest request) { - return Windmill.KeyedGetDataResponse.getDefaultInstance(); - } - - @Override - public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { - return Windmill.GlobalData.getDefaultInstance(); - } - }; - } - @Test public void testTimerInternalsSetTimer() { Windmill.WorkItemCommitRequest.Builder outputBuilder = diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSourcesTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSourcesTest.java index 504b50daa3dce..98302c512256c 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSourcesTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSourcesTest.java @@ -95,7 +95,7 @@ import org.apache.beam.runners.dataflow.worker.util.common.worker.NativeReader; import org.apache.beam.runners.dataflow.worker.util.common.worker.NativeReader.NativeReaderIterator; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.FakeGetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateReader; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; @@ -199,26 +199,11 @@ private static Work createMockWork(Windmill.WorkItem workItem, Watermarks waterm workItem, watermarks, Work.createProcessingContext( - 
COMPUTATION_ID, createMockGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)), + COMPUTATION_ID, new FakeGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)), Instant::now, Collections.emptyList()); } - private static GetDataClient createMockGetDataClient() { - return new GetDataClient() { - @Override - public Windmill.KeyedGetDataResponse getStateData( - String computation, Windmill.KeyedGetDataRequest request) { - return Windmill.KeyedGetDataResponse.getDefaultInstance(); - } - - @Override - public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { - return Windmill.GlobalData.getDefaultInstance(); - } - }; - } - private static class SourceProducingSubSourcesInSplit extends MockSource { int numDesiredBundle; int sourceObjectSize; @@ -1014,7 +999,7 @@ public void testFailedWorkItemsAbort() throws Exception { Watermarks.builder().setInputDataWatermark(new Instant(0)).build(), Work.createProcessingContext( COMPUTATION_ID, - createMockGetDataClient(), + new FakeGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)), Instant::now, diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java index 663edcbcdb75d..a373dffd1dc47 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java @@ -32,7 +32,7 @@ import java.util.Optional; import org.apache.beam.runners.dataflow.worker.streaming.ActiveWorkState.ActivateWorkResult; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; +import 
org.apache.beam.runners.dataflow.worker.windmill.client.getdata.FakeGetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; @@ -82,22 +82,7 @@ private static ExecutableWork expiredWork(Windmill.WorkItem workItem) { private static Work.ProcessingContext createWorkProcessingContext() { return Work.createProcessingContext( - "computationId", createMockGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)); - } - - private static GetDataClient createMockGetDataClient() { - return new GetDataClient() { - @Override - public Windmill.KeyedGetDataResponse getStateData( - String computation, Windmill.KeyedGetDataRequest request) { - return Windmill.KeyedGetDataResponse.getDefaultInstance(); - } - - @Override - public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { - return Windmill.GlobalData.getDefaultInstance(); - } - }; + "computationId", new FakeGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)); } private static WorkId workId(long workToken, long cacheToken) { diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationStateCacheTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationStateCacheTest.java index 658f12cf70ee0..1f70c24763255 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationStateCacheTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationStateCacheTest.java @@ -36,7 +36,7 @@ import org.apache.beam.runners.dataflow.worker.streaming.config.ComputationConfig; import 
org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.FakeGetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; @@ -70,7 +70,7 @@ private static ExecutableWork createWork(ShardedKey shardedKey, long workToken, Watermarks.builder().setInputDataWatermark(Instant.now()).build(), Work.createProcessingContext( "computationId", - createMockGetDataClient(), + new FakeGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)), Instant::now, @@ -78,21 +78,6 @@ private static ExecutableWork createWork(ShardedKey shardedKey, long workToken, ignored -> {}); } - private static GetDataClient createMockGetDataClient() { - return new GetDataClient() { - @Override - public Windmill.KeyedGetDataResponse getStateData( - String computation, Windmill.KeyedGetDataRequest request) { - return Windmill.KeyedGetDataResponse.getDefaultInstance(); - } - - @Override - public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { - return Windmill.GlobalData.getDefaultInstance(); - } - }; - } - @Before public void setUp() { computationStateCache = diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutorTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutorTest.java index ef73d4b0ef27d..ad77958837a12 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutorTest.java +++ 
b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutorTest.java @@ -32,7 +32,7 @@ import org.apache.beam.runners.dataflow.worker.streaming.Watermarks; import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.FakeGetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; @@ -68,7 +68,7 @@ private static ExecutableWork createWork(Consumer executeWorkFn) { Watermarks.builder().setInputDataWatermark(Instant.now()).build(), Work.createProcessingContext( "computationId", - createMockGetDataClient(), + new FakeGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)), Instant::now, @@ -76,34 +76,17 @@ private static ExecutableWork createWork(Consumer executeWorkFn) { executeWorkFn); } - private static GetDataClient createMockGetDataClient() { - return new GetDataClient() { - @Override - public Windmill.KeyedGetDataResponse getStateData( - String computation, Windmill.KeyedGetDataRequest request) { - return Windmill.KeyedGetDataResponse.getDefaultInstance(); - } - - @Override - public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { - return Windmill.GlobalData.getDefaultInstance(); + private Runnable createSleepProcessWorkFn(CountDownLatch start, CountDownLatch stop) { + return () -> { + start.countDown(); + try { + stop.await(); + } catch (Exception e) { + throw new RuntimeException(e); } }; } - private Runnable createSleepProcessWorkFn(CountDownLatch start, CountDownLatch stop) { - Runnable runnable = - () -> { - 
start.countDown(); - try { - stop.await(); - } catch (Exception e) { - throw new RuntimeException(e); - } - }; - return runnable; - } - @Before public void setUp() { this.executor = diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPoolTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPoolTest.java index 7e5b350b48323..bdad382c9af22 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPoolTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPoolTest.java @@ -260,15 +260,5 @@ public String backendWorkerToken() { public void shutdown() { halfClose(); } - - @Override - public boolean isShutdown() { - return closed; - } - - @Override - public Type streamType() { - return Type.GET_DATA; - } } } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingApplianceWorkCommitterTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingApplianceWorkCommitterTest.java index 37ab2c863c79b..51cd83d17fabf 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingApplianceWorkCommitterTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingApplianceWorkCommitterTest.java @@ -36,7 +36,7 @@ import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import 
org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.FakeGetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; @@ -66,7 +66,7 @@ private static Work createMockWork(long workToken) { Watermarks.builder().setInputDataWatermark(Instant.EPOCH).build(), Work.createProcessingContext( "computationId", - createMockGetDataClient(), + new FakeGetDataClient(), ignored -> { throw new UnsupportedOperationException(); }, @@ -75,21 +75,6 @@ private static Work createMockWork(long workToken) { Collections.emptyList()); } - private static GetDataClient createMockGetDataClient() { - return new GetDataClient() { - @Override - public Windmill.KeyedGetDataResponse getStateData( - String computation, Windmill.KeyedGetDataRequest request) { - return Windmill.KeyedGetDataResponse.getDefaultInstance(); - } - - @Override - public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { - return Windmill.GlobalData.getDefaultInstance(); - } - }; - } - private static ComputationState createComputationState(String computationId) { return new ComputationState( computationId, diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitterTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitterTest.java index 36d48d778e8cc..546a2883e3b20 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitterTest.java +++ 
b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitterTest.java @@ -50,13 +50,12 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.CloseableStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.CommitWorkStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStreamPool; -import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.FakeGetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.joda.time.Duration; import org.joda.time.Instant; -import org.junit.After; import org.junit.Before; import org.junit.Rule; import org.junit.Test; @@ -68,7 +67,7 @@ public class StreamingEngineWorkCommitterTest { @Rule public ErrorCollector errorCollector = new ErrorCollector(); - private StreamingEngineWorkCommitter workCommitter; + private WorkCommitter workCommitter; private FakeWindmillServer fakeWindmillServer; private Supplier> commitWorkStreamFactory; @@ -83,7 +82,7 @@ private static Work createMockWork(long workToken) { Watermarks.builder().setInputDataWatermark(Instant.EPOCH).build(), Work.createProcessingContext( "computationId", - createMockGetDataClient(), + new FakeGetDataClient(), ignored -> { throw new UnsupportedOperationException(); }, @@ -92,21 +91,6 @@ private static Work createMockWork(long workToken) { Collections.emptyList()); } - private static GetDataClient createMockGetDataClient() { - return new GetDataClient() { - @Override - public Windmill.KeyedGetDataResponse getStateData( - String computation, Windmill.KeyedGetDataRequest request) { - return 
Windmill.KeyedGetDataResponse.getDefaultInstance(); - } - - @Override - public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { - return Windmill.GlobalData.getDefaultInstance(); - } - }; - } - private static ComputationState createComputationState(String computationId) { return new ComputationState( computationId, @@ -135,14 +119,11 @@ public void setUp() throws IOException { ::getCloseableStream; } - @After - public void cleanUp() { - workCommitter.stop(); - } - - private StreamingEngineWorkCommitter createWorkCommitter( - Consumer onCommitComplete) { - return StreamingEngineWorkCommitter.create(commitWorkStreamFactory, 1, onCommitComplete); + private WorkCommitter createWorkCommitter(Consumer onCommitComplete) { + return StreamingEngineWorkCommitter.builder() + .setCommitWorkStreamFactory(commitWorkStreamFactory) + .setOnCommitComplete(onCommitComplete) + .build(); } @Test @@ -174,6 +155,8 @@ public void testCommit_sendsCommitsToStreamingEngine() { assertThat(request).isEqualTo(commit.request()); assertThat(completeCommits).contains(asCompleteCommit(commit, Windmill.CommitStatus.OK)); } + + workCommitter.stop(); } @Test @@ -214,6 +197,8 @@ public void testCommit_handlesFailedCommits() { .containsEntry(commit.work().getWorkItem().getWorkToken(), commit.request()); } } + + workCommitter.stop(); } @Test @@ -266,6 +251,8 @@ public void testCommit_handlesCompleteCommits_commitStatusNotOK() { .contains(asCompleteCommit(commit, expectedCommitStatus.get(commit.work().id()))); } assertThat(completeCommits.size()).isEqualTo(commits.size()); + + workCommitter.stop(); } @Test @@ -310,11 +297,6 @@ public String backendWorkerToken() { @Override public void shutdown() {} - - @Override - public boolean isShutdown() { - return false; - } }; commitWorkStreamFactory = @@ -359,7 +341,12 @@ public void testMultipleCommitSendersSingleStream() { ::getCloseableStream; Set completeCommits = Collections.newSetFromMap(new ConcurrentHashMap<>()); workCommitter = - 
StreamingEngineWorkCommitter.create(commitWorkStreamFactory, 5, completeCommits::add); + StreamingEngineWorkCommitter.builder() + .setCommitWorkStreamFactory(commitWorkStreamFactory) + .setNumCommitSenders(5) + .setOnCommitComplete(completeCommits::add) + .build(); + List commits = new ArrayList<>(); for (int i = 1; i <= 500; i++) { Work work = createMockWork(i); @@ -384,5 +371,7 @@ public void testMultipleCommitSendersSingleStream() { assertThat(request).isEqualTo(commit.request()); assertThat(completeCommits).contains(asCompleteCommit(commit, Windmill.CommitStatus.OK)); } + + workCommitter.stop(); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/WorkRefreshClient.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FakeGetDataClient.java similarity index 56% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/WorkRefreshClient.java rename to runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FakeGetDataClient.java index 76f6147b07434..ca89e9647153d 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/WorkRefreshClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FakeGetDataClient.java @@ -17,11 +17,23 @@ */ package org.apache.beam.runners.dataflow.worker.windmill.client.getdata; -import java.util.Map; -import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; -import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.Heartbeats; +import java.io.PrintWriter; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -/** Client for requesting work 
refresh via heartbeats. */ -public interface WorkRefreshClient { - void refreshActiveWork(Map heartbeats); +/** Fake {@link GetDataClient} implementation for testing. */ +public final class FakeGetDataClient implements GetDataClient { + @Override + public Windmill.KeyedGetDataResponse getStateData( + String computationId, Windmill.KeyedGetDataRequest request) throws GetDataException { + return Windmill.KeyedGetDataResponse.getDefaultInstance(); + } + + @Override + public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) + throws GetDataException { + return Windmill.GlobalData.getDefaultInstance(); + } + + @Override + public void printHtml(PrintWriter writer) {} } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTrackerTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTrackerTest.java index b19e7f06896cb..d687434edff43 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTrackerTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTrackerTest.java @@ -20,19 +20,15 @@ import static com.google.common.truth.Truth.assertThat; import static org.junit.Assert.assertFalse; import static org.mockito.ArgumentMatchers.anyString; -import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.doAnswer; import static org.mockito.Mockito.doNothing; import static org.mockito.Mockito.mock; -import java.util.List; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import 
org.apache.beam.runners.dataflow.worker.util.MemoryMonitor; -import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.ThrottlingGetDataMetricTracker.Type; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -47,15 +43,14 @@ public class ThrottlingGetDataMetricTrackerTest { private final ExecutorService getDataProcessor = Executors.newCachedThreadPool(); @Test - public void testTrackSingleCallWithThrottling_STATE() throws InterruptedException { - doNothing().when(memoryMonitor).waitForResources(eq(Type.STATE.debugName())); + public void testTrackFetchStateDataWithThrottling() throws InterruptedException { + doNothing().when(memoryMonitor).waitForResources(anyString()); CountDownLatch processCall = new CountDownLatch(1); CountDownLatch callProcessing = new CountDownLatch(1); CountDownLatch processingDone = new CountDownLatch(1); getDataProcessor.submit( () -> { - try (AutoCloseable ignored = - getDataMetricTracker.trackSingleCallWithThrottling(Type.STATE)) { + try (AutoCloseable ignored = getDataMetricTracker.trackStateDataFetchWithThrottling()) { callProcessing.countDown(); processCall.await(); } catch (Exception e) { @@ -65,7 +60,7 @@ public void testTrackSingleCallWithThrottling_STATE() throws InterruptedExceptio }); callProcessing.await(); - ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsWhileProcessing = + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsWhileProcessing = getDataMetricTracker.getMetricsSnapshot(); assertThat(metricsWhileProcessing.activeStateReads()).isEqualTo(1); @@ -76,7 +71,7 @@ public void testTrackSingleCallWithThrottling_STATE() throws InterruptedExceptio // decremented processCall.countDown(); processingDone.await(); - ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsAfterProcessing = + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsAfterProcessing 
= getDataMetricTracker.getMetricsSnapshot(); assertThat(metricsAfterProcessing.activeStateReads()).isEqualTo(0); assertThat(metricsAfterProcessing.activeHeartbeats()).isEqualTo(0); @@ -84,15 +79,14 @@ public void testTrackSingleCallWithThrottling_STATE() throws InterruptedExceptio } @Test - public void testTrackSingleCallWithThrottling_SIDE_INPUT() throws InterruptedException { - doNothing().when(memoryMonitor).waitForResources(eq(Type.SIDE_INPUT.debugName())); + public void testTrackSideInputFetchWithThrottling() throws InterruptedException { + doNothing().when(memoryMonitor).waitForResources(anyString()); CountDownLatch processCall = new CountDownLatch(1); CountDownLatch callProcessing = new CountDownLatch(1); CountDownLatch processingDone = new CountDownLatch(1); getDataProcessor.submit( () -> { - try (AutoCloseable ignored = - getDataMetricTracker.trackSingleCallWithThrottling(Type.SIDE_INPUT)) { + try (AutoCloseable ignored = getDataMetricTracker.trackSideInputFetchWithThrottling()) { callProcessing.countDown(); processCall.await(); } catch (Exception e) { @@ -102,7 +96,7 @@ public void testTrackSingleCallWithThrottling_SIDE_INPUT() throws InterruptedExc }); callProcessing.await(); - ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsWhileProcessing = + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsWhileProcessing = getDataMetricTracker.getMetricsSnapshot(); assertThat(metricsWhileProcessing.activeStateReads()).isEqualTo(0); @@ -113,96 +107,7 @@ public void testTrackSingleCallWithThrottling_SIDE_INPUT() throws InterruptedExc // decremented processCall.countDown(); processingDone.await(); - ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsAfterProcessing = - getDataMetricTracker.getMetricsSnapshot(); - assertThat(metricsAfterProcessing.activeStateReads()).isEqualTo(0); - assertThat(metricsAfterProcessing.activeHeartbeats()).isEqualTo(0); - assertThat(metricsAfterProcessing.activeSideInputs()).isEqualTo(0); - } - 
- @Test - public void testTrackSingleCallWithThrottling_HEARTBEAT() throws InterruptedException { - doNothing().when(memoryMonitor).waitForResources(eq(Type.HEARTBEAT.debugName())); - CountDownLatch processCall = new CountDownLatch(1); - CountDownLatch callProcessing = new CountDownLatch(1); - CountDownLatch processingDone = new CountDownLatch(1); - getDataProcessor.submit( - () -> { - try (AutoCloseable ignored = - getDataMetricTracker.trackSingleCallWithThrottling(Type.HEARTBEAT)) { - callProcessing.countDown(); - processCall.await(); - } catch (Exception e) { - // Do nothing. - } - processingDone.countDown(); - }); - - callProcessing.await(); - ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsWhileProcessing = - getDataMetricTracker.getMetricsSnapshot(); - - assertThat(metricsWhileProcessing.activeStateReads()).isEqualTo(0); - assertThat(metricsWhileProcessing.activeHeartbeats()).isEqualTo(1); - assertThat(metricsWhileProcessing.activeSideInputs()).isEqualTo(0); - - // Free the thread inside the AutoCloseable, wait for processingDone and check that metrics gets - // decremented - processCall.countDown(); - processingDone.await(); - ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsAfterProcessing = - getDataMetricTracker.getMetricsSnapshot(); - assertThat(metricsAfterProcessing.activeStateReads()).isEqualTo(0); - assertThat(metricsAfterProcessing.activeHeartbeats()).isEqualTo(0); - assertThat(metricsAfterProcessing.activeSideInputs()).isEqualTo(0); - } - - @Test - public void testTrackSingleCall_multipleThreads() throws InterruptedException { - doNothing().when(memoryMonitor).waitForResources(anyString()); - // Issuing 5 calls (1 from each thread) - // 2 State Reads - // 2 SideInput Reads - // 1 Heartbeat - List callTypes = - Lists.newArrayList( - Type.STATE, Type.SIDE_INPUT, Type.STATE, Type.HEARTBEAT, Type.SIDE_INPUT); - CountDownLatch processCall = new CountDownLatch(callTypes.size()); - CountDownLatch callProcessing = 
new CountDownLatch(callTypes.size()); - CountDownLatch processingDone = new CountDownLatch(callTypes.size()); - for (Type callType : callTypes) { - getDataProcessor.submit( - () -> { - try (AutoCloseable ignored = - getDataMetricTracker.trackSingleCallWithThrottling(callType)) { - callProcessing.countDown(); - processCall.await(); - } catch (Exception e) { - // Do nothing. - } - processingDone.countDown(); - }); - } - - callProcessing.await(); - ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsWhileProcessing = - getDataMetricTracker.getMetricsSnapshot(); - - // Asserting that metrics reflects: - // 2 State Reads - // 2 SideInput Reads - // 1 Heartbeat - assertThat(metricsWhileProcessing.activeStateReads()).isEqualTo(2); - assertThat(metricsWhileProcessing.activeSideInputs()).isEqualTo(2); - assertThat(metricsWhileProcessing.activeHeartbeats()).isEqualTo(1); - - // Free the thread inside the AutoCloseable, wait for processingDone and check that metrics gets - // decremented - for (int i = 0; i < callTypes.size(); i++) { - processCall.countDown(); - } - processingDone.await(); - ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsAfterProcessing = + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsAfterProcessing = getDataMetricTracker.getMetricsSnapshot(); assertThat(metricsAfterProcessing.activeStateReads()).isEqualTo(0); assertThat(metricsAfterProcessing.activeHeartbeats()).isEqualTo(0); @@ -217,8 +122,7 @@ public void testThrottledTrackSingleCallWithThrottling() throws InterruptedExcep CountDownLatch processingDone = new CountDownLatch(1); getDataProcessor.submit( () -> { - try (AutoCloseable ignored = - getDataMetricTracker.trackSingleCallWithThrottling(Type.STATE)) { + try (AutoCloseable ignored = getDataMetricTracker.trackStateDataFetchWithThrottling()) { callProcessing.countDown(); processCall.await(); } catch (Exception e) { @@ -228,7 +132,7 @@ public void testThrottledTrackSingleCallWithThrottling() throws 
InterruptedExcep }); assertFalse(callProcessing.await(10, TimeUnit.MILLISECONDS)); - ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsBeforeProcessing = + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsBeforeProcessing = getDataMetricTracker.getMetricsSnapshot(); assertThat(metricsBeforeProcessing.activeStateReads()).isEqualTo(0); assertThat(metricsBeforeProcessing.activeHeartbeats()).isEqualTo(0); @@ -237,7 +141,7 @@ public void testThrottledTrackSingleCallWithThrottling() throws InterruptedExcep // Stop throttling. mockThrottler.countDown(); callProcessing.await(); - ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsWhileProcessing = + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsWhileProcessing = getDataMetricTracker.getMetricsSnapshot(); assertThat(metricsWhileProcessing.activeStateReads()).isEqualTo(1); @@ -246,7 +150,7 @@ public void testThrottledTrackSingleCallWithThrottling() throws InterruptedExcep // decremented processCall.countDown(); processingDone.await(); - ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsAfterProcessing = + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsAfterProcessing = getDataMetricTracker.getMetricsSnapshot(); assertThat(metricsAfterProcessing.activeStateReads()).isEqualTo(0); } @@ -263,8 +167,7 @@ public void testTrackSingleCall_exceptionThrown() throws InterruptedException { getDataProcessor.submit( () -> { try { - try (AutoCloseable ignored = - getDataMetricTracker.trackSingleCallWithThrottling(Type.STATE)) { + try (AutoCloseable ignored = getDataMetricTracker.trackStateDataFetchWithThrottling()) { callProcessing.countDown(); beforeException.await(); throw new RuntimeException("something bad happened"); @@ -277,7 +180,7 @@ public void testTrackSingleCall_exceptionThrown() throws InterruptedException { callProcessing.await(); - ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsWhileProcessing = + 
ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsWhileProcessing = getDataMetricTracker.getMetricsSnapshot(); assertThat(metricsWhileProcessing.activeStateReads()).isEqualTo(1); @@ -285,7 +188,7 @@ public void testTrackSingleCall_exceptionThrown() throws InterruptedException { // In the midst of an exception, close() should still run. afterException.await(); - ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsAfterProcessing = + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsAfterProcessing = getDataMetricTracker.getMetricsSnapshot(); assertThat(metricsAfterProcessing.activeStateReads()).isEqualTo(0); } @@ -308,7 +211,7 @@ public void testTrackHeartbeats() throws InterruptedException { }); callProcessing.await(); - ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsWhileProcessing = + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsWhileProcessing = getDataMetricTracker.getMetricsSnapshot(); assertThat(metricsWhileProcessing.activeHeartbeats()).isEqualTo(5); @@ -317,7 +220,7 @@ public void testTrackHeartbeats() throws InterruptedException { // decremented processCall.countDown(); processingDone.await(); - ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsAfterProcessing = + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsAfterProcessing = getDataMetricTracker.getMetricsSnapshot(); assertThat(metricsAfterProcessing.activeHeartbeats()).isEqualTo(0); } @@ -346,7 +249,7 @@ public void testTrackHeartbeats_exceptionThrown() throws InterruptedException { callProcessing.await(); - ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsWhileProcessing = + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsWhileProcessing = getDataMetricTracker.getMetricsSnapshot(); assertThat(metricsWhileProcessing.activeHeartbeats()).isEqualTo(numHeartbeats); @@ -354,7 +257,7 @@ public void testTrackHeartbeats_exceptionThrown() throws InterruptedException { // In the midst of an 
exception, close() should still run. afterException.await(); - ThrottlingGetDataMetricTracker.GetDataMetrics.ReadOnlySnapshot metricsAfterProcessing = + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsAfterProcessing = getDataMetricTracker.getMetricsSnapshot(); assertThat(metricsAfterProcessing.activeHeartbeats()).isEqualTo(0); } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessorTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessorTest.java index ea90bb276a4bb..146b05bb7e35f 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessorTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessorTest.java @@ -35,7 +35,7 @@ import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.FakeGetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; @@ -89,7 +89,7 @@ private static ExecutableWork createWork(Supplier clock, Consumer Watermarks.builder().setInputDataWatermark(Instant.EPOCH).build(), Work.createProcessingContext( "computationId", - createMockGetDataClient(), + new FakeGetDataClient(), ignored -> {}, 
mock(HeartbeatSender.class)), clock, @@ -97,21 +97,6 @@ private static ExecutableWork createWork(Supplier clock, Consumer processWorkFn); } - private static GetDataClient createMockGetDataClient() { - return new GetDataClient() { - @Override - public Windmill.KeyedGetDataResponse getStateData( - String computation, Windmill.KeyedGetDataRequest request) { - return Windmill.KeyedGetDataResponse.getDefaultInstance(); - } - - @Override - public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { - return Windmill.GlobalData.getDefaultInstance(); - } - }; - } - private static ExecutableWork createWork(Consumer processWorkFn) { return createWork(Instant::now, processWorkFn); } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresherTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresherTest.java index dbd5959293167..9dce3392c60c5 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresherTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresherTest.java @@ -47,13 +47,12 @@ import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.FakeGetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.direct.Clock; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import 
org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.HashBasedTable; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Table; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; import org.joda.time.Duration; @@ -61,6 +60,7 @@ import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; +import org.mockito.ArgumentCaptor; @RunWith(JUnit4.class) public class ActiveWorkRefresherTest { @@ -94,27 +94,12 @@ private static ComputationState createComputationState( stateCache); } - private static GetDataClient createMockGetDataClient() { - return new GetDataClient() { - @Override - public Windmill.KeyedGetDataResponse getStateData( - String computation, Windmill.KeyedGetDataRequest request) { - return Windmill.KeyedGetDataResponse.getDefaultInstance(); - } - - @Override - public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { - return Windmill.GlobalData.getDefaultInstance(); - } - }; - } - private ActiveWorkRefresher createActiveWorkRefresher( Supplier clock, int activeWorkRefreshPeriodMillis, int stuckCommitDurationMillis, Supplier> computations, - Consumer> activeWorkRefresherFn) { + ActiveWorkRefresher.HeartbeatTracker heartbeatTracker) { return new ActiveWorkRefresher( clock, activeWorkRefreshPeriodMillis, @@ -122,7 +107,7 @@ private ActiveWorkRefresher createActiveWorkRefresher( computations, DataflowExecutionStateSampler.instance(), Executors.newSingleThreadScheduledExecutor(), - activeWorkRefresherFn); + heartbeatTracker); } private ExecutableWork createOldWork(int workIds, Consumer processWork) { @@ -142,7 +127,7 @@ private ExecutableWork createOldWork( .build(), Watermarks.builder().setInputDataWatermark(Instant.EPOCH).build(), 
Work.createProcessingContext( - "computationId", createMockGetDataClient(), ignored -> {}, heartbeatSender), + "computationId", new FakeGetDataClient(), ignored -> {}, heartbeatSender), A_LONG_TIME_AGO, ImmutableList.of()), processWork); @@ -177,7 +162,6 @@ public void testActiveWorkRefresh() throws InterruptedException { activeWorkForComputation.add(fakeWork); } - Map fanoutExpectedHeartbeats = new HashMap<>(); CountDownLatch heartbeatsSent = new CountDownLatch(1); TestClock fakeClock = new TestClock(Instant.now()); ActiveWorkRefresher activeWorkRefresher = @@ -186,40 +170,36 @@ public void testActiveWorkRefresh() throws InterruptedException { activeWorkRefreshPeriodMillis, 0, () -> computations, - heartbeats -> { - fanoutExpectedHeartbeats.putAll(heartbeats); - heartbeatsSent.countDown(); - }); + heartbeats -> heartbeatsSent::countDown); + ArgumentCaptor heartbeatsCaptor = ArgumentCaptor.forClass(Heartbeats.class); activeWorkRefresher.start(); fakeClock.advance(Duration.millis(activeWorkRefreshPeriodMillis * 2)); heartbeatsSent.await(); activeWorkRefresher.stop(); - + verify(heartbeatSender).sendHeartbeats(heartbeatsCaptor.capture()); + Heartbeats fanoutExpectedHeartbeats = heartbeatsCaptor.getValue(); assertThat(computationsAndWork.size()) - .isEqualTo( - Iterables.getOnlyElement(fanoutExpectedHeartbeats.values()).heartbeatRequests().size()); - for (Map.Entry fanOutExpectedHeartbeat : - fanoutExpectedHeartbeats.entrySet()) { - for (Map.Entry> expectedHeartbeat : - fanOutExpectedHeartbeat.getValue().heartbeatRequests().asMap().entrySet()) { - String computationId = expectedHeartbeat.getKey(); - Collection heartbeatRequests = expectedHeartbeat.getValue(); - List work = - computationsAndWork.get(computationId).stream() - .map(ExecutableWork::work) - .collect(Collectors.toList()); - // Compare the heartbeatRequest's and Work's workTokens, cacheTokens, and shardingKeys. 
- assertThat(heartbeatRequests) - .comparingElementsUsing( - Correspondence.from( - (Windmill.HeartbeatRequest h, Work w) -> - h.getWorkToken() == w.getWorkItem().getWorkToken() - && h.getCacheToken() == w.getWorkItem().getWorkToken() - && h.getShardingKey() == w.getWorkItem().getShardingKey(), - "heartbeatRequest's and Work's workTokens, cacheTokens, and shardingKeys should be equal.")) - .containsExactlyElementsIn(work); - } + .isEqualTo(fanoutExpectedHeartbeats.heartbeatRequests().size()); + + for (Map.Entry> expectedHeartbeat : + fanoutExpectedHeartbeats.heartbeatRequests().asMap().entrySet()) { + String computationId = expectedHeartbeat.getKey(); + Collection heartbeatRequests = expectedHeartbeat.getValue(); + List work = + computationsAndWork.get(computationId).stream() + .map(ExecutableWork::work) + .collect(Collectors.toList()); + // Compare the heartbeatRequest's and Work's workTokens, cacheTokens, and shardingKeys. + assertThat(heartbeatRequests) + .comparingElementsUsing( + Correspondence.from( + (Windmill.HeartbeatRequest h, Work w) -> + h.getWorkToken() == w.getWorkItem().getWorkToken() + && h.getCacheToken() == w.getWorkItem().getWorkToken() + && h.getShardingKey() == w.getWorkItem().getShardingKey(), + "heartbeatRequest's and Work's workTokens, cacheTokens, and shardingKeys should be equal.")) + .containsExactlyElementsIn(work); } activeWorkRefresher.stop(); @@ -265,7 +245,7 @@ public void testInvalidateStuckCommits() throws InterruptedException { 0, stuckCommitDurationMillis, computations.rowMap()::keySet, - ignored -> {}); + ignored -> () -> {}); activeWorkRefresher.start(); fakeClock.advance(Duration.millis(stuckCommitDurationMillis)); From 908a0e0876296885b64773bf1705d7b41f7ec0e0 Mon Sep 17 00:00:00 2001 From: Martin Trieu Date: Tue, 30 Jul 2024 16:12:36 +0900 Subject: [PATCH 7/7] address PR comments --- .../worker/StreamingDataflowWorker.java | 6 +-- .../worker/streaming/ActiveWorkState.java | 3 +- .../client/AbstractWindmillStream.java | 6 +++ 
.../windmill/client/WindmillStream.java | 6 --- .../commits/StreamingEngineWorkCommitter.java | 8 +-- .../client/getdata/StreamGetDataClient.java | 6 +-- .../getdata/StreamPoolGetDataClient.java | 3 +- .../ThrottlingGetDataMetricTracker.java | 5 +- .../work/refresh/ActiveWorkRefresher.java | 49 +++++++------------ .../refresh/FixedStreamHeartbeatSender.java | 14 ++++-- .../windmill/work/refresh/Heartbeats.java | 29 ++++------- 11 files changed, 58 insertions(+), 77 deletions(-) diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java index c30e1620a5e11..0c51c381b3600 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java @@ -198,6 +198,9 @@ private StreamingDataflowWorker( GrpcWindmillStreamFactory windmillStreamFactory, Function executorSupplier, ConcurrentMap stageInfoMap) { + // Register standard file systems. + FileSystems.setDefaultPipelineOptions(options); + this.configFetcher = configFetcher; this.computationStateCache = computationStateCache; this.stateCache = windmillStateCache; @@ -331,9 +334,6 @@ private StreamingDataflowWorker( ID_GENERATOR, stageInfoMap); - // Register standard file systems. 
- FileSystems.setDefaultPipelineOptions(options); - LOG.debug("windmillServiceEnabled: {}", windmillServiceEnabled); LOG.debug("WindmillServiceEndpoint: {}", options.getWindmillServiceEndpoint()); LOG.debug("WindmillServicePort: {}", options.getWindmillServicePort()); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java index 56b0e3f539a50..c80c3a882e528 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java @@ -208,8 +208,7 @@ synchronized ImmutableListMultimap getReadOnlyActiv Entry::getKey, e -> e.getValue().stream() - .map(ExecutableWork::work) - .map(work -> (RefreshableWork) work))); + .map(executableWork -> (RefreshableWork) executableWork.work()))); } synchronized ImmutableList getRefreshableWork(Instant refreshDeadline) { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java index fd0d1b1a3a92d..58aecfc71e00f 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java @@ -290,6 +290,12 @@ private void setLastError(String error) { lastErrorTime.set(DateTime.now()); } + public static class WindmillStreamShutdownException extends RuntimeException { + public 
WindmillStreamShutdownException(String message) { + super(message); + } + } + private class ResponseObserver implements StreamObserver { @Override diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java index ee467c01c8f6e..31bd4e146a78d 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java @@ -114,10 +114,4 @@ default void close() { /** Interface for streaming GetWorkerMetadata requests to Windmill. */ @ThreadSafe interface GetWorkerMetadataStream extends WindmillStream {} - - class WindmillStreamShutdownException extends RuntimeException { - public WindmillStreamShutdownException(String message) { - super(message); - } - } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitter.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitter.java index afdb29560a2b2..bf1007bc4bfbf 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitter.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitter.java @@ -47,6 +47,7 @@ public final class StreamingEngineWorkCommitter implements WorkCommitter { private static final Logger LOG = LoggerFactory.getLogger(StreamingEngineWorkCommitter.class); private static final int 
TARGET_COMMIT_BATCH_KEYS = 5; private static final int MAX_COMMIT_QUEUE_BYTES = 500 << 20; // 500MB + private static final String NO_BACKEND_WORKER_TOKEN = ""; private final Supplier> commitWorkStreamFactory; private final WeightedBoundedQueue commitQueue; @@ -84,7 +85,7 @@ public final class StreamingEngineWorkCommitter implements WorkCommitter { public static Builder builder() { return new AutoBuilder_StreamingEngineWorkCommitter_Builder() - .setBackendWorkerToken("") + .setBackendWorkerToken(NO_BACKEND_WORKER_TOKEN) .setNumCommitSenders(1); } @@ -222,7 +223,7 @@ private boolean tryAddToCommitBatch(Commit commit, CommitWorkStream.RequestBatch */ private @Nullable Commit expandBatch(CommitWorkStream.RequestBatcher batcher) { int commits = 1; - while (isRunning.get()) { + while (true) { Commit commit; try { if (commits < TARGET_COMMIT_BATCH_KEYS) { @@ -231,6 +232,7 @@ private boolean tryAddToCommitBatch(Commit commit, CommitWorkStream.RequestBatch commit = commitQueue.poll(); } } catch (InterruptedException e) { + Thread.currentThread().interrupt(); return null; } @@ -249,8 +251,6 @@ private boolean tryAddToCommitBatch(Commit commit, CommitWorkStream.RequestBatch } commits++; } - - return null; } @AutoBuilder diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamGetDataClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamGetDataClient.java index b0625384641e2..c8e058e7e2307 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamGetDataClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamGetDataClient.java @@ -21,7 +21,7 @@ import java.util.function.Function; import 
org.apache.beam.runners.dataflow.worker.WorkItemCancelledException; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.AbstractWindmillStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; import org.apache.beam.sdk.annotations.Internal; @@ -61,7 +61,7 @@ public Windmill.KeyedGetDataResponse getStateData( String computationId, Windmill.KeyedGetDataRequest request) throws GetDataException { try (AutoCloseable ignored = getDataMetricTracker.trackStateDataFetchWithThrottling()) { return getDataStream.requestKeyedData(computationId, request); - } catch (WindmillStream.WindmillStreamShutdownException e) { + } catch (AbstractWindmillStream.WindmillStreamShutdownException e) { throw new WorkItemCancelledException(request.getShardingKey()); } catch (Exception e) { throw new GetDataException( @@ -86,7 +86,7 @@ public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) sideInputGetDataStreamFactory.apply(request.getDataId().getTag()); try (AutoCloseable ignored = getDataMetricTracker.trackSideInputFetchWithThrottling()) { return sideInputGetDataStream.requestGlobalData(request); - } catch (WindmillStream.WindmillStreamShutdownException e) { + } catch (AbstractWindmillStream.WindmillStreamShutdownException e) { throw new WorkItemCancelledException(e); } catch (Exception e) { throw new GetDataException( diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamPoolGetDataClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamPoolGetDataClient.java index d6b20e425b0ba..49fe3e4bdc15f 100644 --- 
a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamPoolGetDataClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamPoolGetDataClient.java @@ -30,8 +30,7 @@ /** * StreamingEngine implementation of {@link GetDataClient}. * - * @implNote Uses {@link WindmillStreamPool} to send requests. Depending on options, may use a - * dedicated stream pool for heartbeats. + * @implNote Uses {@link WindmillStreamPool} to send requests. */ @Internal @ThreadSafe diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTracker.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTracker.java index a66cf932bd742..6bb00292e29a5 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTracker.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTracker.java @@ -72,9 +72,8 @@ AutoCloseable trackSideInputFetchWithThrottling() { * metric after the call is finished. 
*/ public AutoCloseable trackHeartbeats(int numHeartbeats) { - activeHeartbeats.getAndUpdate( - currentActiveHeartbeats -> currentActiveHeartbeats + numHeartbeats); - return () -> activeHeartbeats.getAndUpdate(existing -> existing - numHeartbeats); + activeHeartbeats.getAndAdd(numHeartbeats); + return () -> activeHeartbeats.getAndAdd(-numHeartbeats); } public void printHtml(PrintWriter writer) { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java index c4dc375cdb020..499d2e5b6943c 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java @@ -30,12 +30,12 @@ import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.function.Supplier; +import javax.annotation.Nullable; import javax.annotation.concurrent.ThreadSafe; import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; import org.apache.beam.runners.dataflow.worker.streaming.ComputationState; import org.apache.beam.runners.dataflow.worker.streaming.RefreshableWork; import org.apache.beam.sdk.annotations.Internal; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; import org.joda.time.Duration; import org.joda.time.Instant; @@ -82,13 +82,7 @@ public ActiveWorkRefresher( this.heartbeatTracker = heartbeatTracker; this.fanOutActiveWorkRefreshExecutor = Executors.newCachedThreadPool( - new ThreadFactoryBuilder() - // Work 
refresh runs as a background process, don't let failures crash - // the worker. - .setUncaughtExceptionHandler( - (t, e) -> LOG.error("Unexpected failure in {}", t.getName(), e)) - .setNameFormat(FAN_OUT_REFRESH_WORK_EXECUTOR_NAME) - .build()); + new ThreadFactoryBuilder().setNameFormat(FAN_OUT_REFRESH_WORK_EXECUTOR_NAME).build()); } @SuppressWarnings("FutureReturnValueIgnored") @@ -132,37 +126,28 @@ private void invalidateStuckCommits() { } } - /** Create {@link Heartbeats} and group them by {@link HeartbeatSender}. */ private void refreshActiveWork() { Instant refreshDeadline = clock.get().minus(Duration.millis(activeWorkRefreshPeriodMillis)); Map heartbeatsBySender = aggregateHeartbeatsBySender(refreshDeadline); - if (heartbeatsBySender.isEmpty()) { - return; - } + List> fanOutRefreshActiveWork = new ArrayList<>(); - if (heartbeatsBySender.size() == 1) { - // If there is a single HeartbeatSender, just use the calling thread to send heartbeats. - Map.Entry heartbeat = - Iterables.getOnlyElement(heartbeatsBySender.entrySet()); - sendHeartbeat(heartbeat); - } else { - // If there are multiple HeartbeatSenders, send out the heartbeats in parallel using the - // fanOutActiveWorkRefreshExecutor. - List> fanOutRefreshActiveWork = new ArrayList<>(); - for (Map.Entry heartbeat : heartbeatsBySender.entrySet()) { + // Send the first heartbeat on the calling thread, and fan out the rest via the + // fanOutActiveWorkRefreshExecutor. + @Nullable Map.Entry firstHeartbeat = null; + for (Map.Entry heartbeat : heartbeatsBySender.entrySet()) { + if (firstHeartbeat == null) { + firstHeartbeat = heartbeat; + } else { fanOutRefreshActiveWork.add( CompletableFuture.runAsync( - () -> sendHeartbeat(heartbeat), fanOutActiveWorkRefreshExecutor)); + () -> sendHeartbeatSafely(heartbeat), fanOutActiveWorkRefreshExecutor)); } - - // Don't block until we kick off all the refresh active work RPCs. 
- @SuppressWarnings("rawtypes") - CompletableFuture parallelFanOutRefreshActiveWork = - CompletableFuture.allOf(fanOutRefreshActiveWork.toArray(new CompletableFuture[0])); - parallelFanOutRefreshActiveWork.join(); } + + sendHeartbeatSafely(firstHeartbeat); + fanOutRefreshActiveWork.forEach(CompletableFuture::join); } /** Aggregate the heartbeats across computations by HeartbeatSender for correct fan out. */ @@ -182,7 +167,11 @@ private Map aggregateHeartbeatsBySender(Instant ref .collect(toImmutableMap(Map.Entry::getKey, e -> e.getValue().build())); } - private void sendHeartbeat(Map.Entry heartbeat) { + /** + * Send the {@link Heartbeats} using the {@link HeartbeatSender}. Safe since exceptions are caught + * and logged. + */ + private void sendHeartbeatSafely(Map.Entry heartbeat) { try (AutoCloseable ignored = heartbeatTracker.trackHeartbeats(heartbeat.getValue().size())) { HeartbeatSender sender = heartbeat.getKey(); Heartbeats heartbeats = heartbeat.getValue(); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/FixedStreamHeartbeatSender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/FixedStreamHeartbeatSender.java index b1c42618b09cb..33a55d1927f8b 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/FixedStreamHeartbeatSender.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/FixedStreamHeartbeatSender.java @@ -18,8 +18,9 @@ package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; import java.util.Objects; +import javax.annotation.Nullable; import org.apache.beam.runners.dataflow.worker.streaming.RefreshableWork; -import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream; +import 
org.apache.beam.runners.dataflow.worker.windmill.client.AbstractWindmillStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; import org.apache.beam.sdk.annotations.Internal; import org.slf4j.Logger; @@ -50,16 +51,17 @@ public static FixedStreamHeartbeatSender create(GetDataStream getDataStream) { @Override public void sendHeartbeats(Heartbeats heartbeats) { - String threadName = Thread.currentThread().getName(); + @Nullable String originalThreadName = null; try { String backendWorkerToken = getDataStream.backendWorkerToken(); if (!backendWorkerToken.isEmpty()) { // Decorate the thread name w/ the backendWorkerToken for debugging. Resets the thread's // name after sending the heartbeats succeeds or fails. - Thread.currentThread().setName(threadName + "-" + backendWorkerToken); + originalThreadName = Thread.currentThread().getName(); + Thread.currentThread().setName(originalThreadName + "-" + backendWorkerToken); } getDataStream.refreshActiveWork(heartbeats.heartbeatRequests().asMap()); - } catch (WindmillStream.WindmillStreamShutdownException e) { + } catch (AbstractWindmillStream.WindmillStreamShutdownException e) { LOG.warn( "Trying to refresh work w/ {} heartbeats on stream={} after work has moved off of worker." 
+ " heartbeats", @@ -67,7 +69,9 @@ public void sendHeartbeats(Heartbeats heartbeats) { heartbeats.heartbeatRequests().size()); heartbeats.work().forEach(RefreshableWork::setFailed); } finally { - Thread.currentThread().setName(threadName); + if (originalThreadName != null) { + Thread.currentThread().setName(originalThreadName); + } } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeats.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeats.java index 78e9864f4eed3..071bf7fa3d432 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeats.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeats.java @@ -26,7 +26,7 @@ /** Heartbeat requests and the work that was used to generate the heartbeat requests. 
*/ @AutoValue -public abstract class Heartbeats { +abstract class Heartbeats { static Heartbeats.Builder builder() { return new AutoValue_Heartbeats.Builder(); @@ -34,22 +34,24 @@ static Heartbeats.Builder builder() { abstract ImmutableList work(); - public abstract ImmutableListMultimap heartbeatRequests(); + abstract ImmutableListMultimap heartbeatRequests(); - public final int size() { + final int size() { return heartbeatRequests().asMap().size(); } @AutoValue.Builder - public abstract static class Builder { - abstract Builder setWork(ImmutableList value); + abstract static class Builder { abstract ImmutableList.Builder workBuilder(); - public final Builder add( + abstract ImmutableListMultimap.Builder + heartbeatRequestsBuilder(); + + final Builder add( String computationId, RefreshableWork work, DataflowExecutionStateSampler sampler) { workBuilder().add(work); - addHeartbeatRequest(computationId, createHeartbeatRequest(work, sampler)); + heartbeatRequestsBuilder().put(computationId, createHeartbeatRequest(work, sampler)); return this; } @@ -63,17 +65,6 @@ private Windmill.HeartbeatRequest createHeartbeatRequest( .build(); } - abstract Builder setHeartbeatRequests( - ImmutableListMultimap value); - - abstract ImmutableListMultimap.Builder - heartbeatRequestsBuilder(); - - private void addHeartbeatRequest( - String computationId, Windmill.HeartbeatRequest heartbeatRequest) { - heartbeatRequestsBuilder().put(computationId, heartbeatRequest); - } - - public abstract Heartbeats build(); + abstract Heartbeats build(); } }