HDDS-11200. Hsync client-side metrics #7371

Merged · 2 commits · Nov 4, 2024

Changes from 1 commit
@@ -28,6 +28,7 @@
import org.apache.hadoop.metrics2.lib.MetricsRegistry;
import org.apache.hadoop.metrics2.lib.MutableCounterLong;
import org.apache.hadoop.metrics2.lib.MutableQuantiles;
import org.apache.hadoop.metrics2.lib.MutableRate;
import org.apache.hadoop.ozone.OzoneConsts;

import java.util.Map;
@@ -52,6 +53,17 @@ public final class ContainerClientMetrics {
private MutableCounterLong totalWriteChunkCalls;
@Metric
private MutableCounterLong totalWriteChunkBytes;

@Metric
private MutableRate hsyncSynchronizedWorkNs;
@Metric
private MutableRate hsyncSendWriteChunkNs;
@Metric
private MutableRate hsyncWaitForFlushNs;
@Metric
private MutableRate hsyncWatchForCommitNs;
jojochuang marked this conversation as resolved.


private MutableQuantiles[] listBlockLatency;
private MutableQuantiles[] getBlockLatency;
private MutableQuantiles[] getCommittedBlockLengthLatency;
@@ -249,4 +261,20 @@ Map<PipelineID, MutableCounterLong> getWriteChunkCallsByPipeline() {
Map<UUID, MutableCounterLong> getWriteChunksCallsByLeaders() {
return writeChunksCallsByLeaders;
}

public MutableRate getHsyncSynchronizedWorkNs() {
return hsyncSynchronizedWorkNs;
}

public MutableRate getHsyncSendWriteChunkNs() {
return hsyncSendWriteChunkNs;
}

public MutableRate getHsyncWaitForFlushNs() {
return hsyncWaitForFlushNs;
}

public MutableRate getHsyncWatchForCommitNs() {
return hsyncWatchForCommitNs;
}
}
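For context (an explanatory note, not part of the diff): a Hadoop metrics2 MutableRate tracks the number of samples and their average, published as `<Name>NumOps` and `<Name>AvgTime`. The `Ns` suffix on these new rates indicates that samples are recorded in nanoseconds, typically taken with `Time.monotonicNowNanos()`. A minimal sketch of feeding one of these rates, where `doTimedWork()` is a hypothetical placeholder:

```java
// Sketch only: record a nanosecond latency sample into one of the new rates.
// doTimedWork() is a hypothetical placeholder for the operation being timed.
long start = Time.monotonicNowNanos();
doTimedWork();
clientMetrics.getHsyncSynchronizedWorkNs().add(Time.monotonicNowNanos() - start);
```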
@@ -62,7 +62,9 @@
import static org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.putBlockAsync;
import static org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.writeChunkAsync;
import static org.apache.hadoop.ozone.OzoneConsts.INCREMENTAL_CHUNK_LIST;
import static org.apache.hadoop.ozone.util.MetricUtil.captureLatencyNs;

import org.apache.hadoop.util.Time;
import org.apache.ratis.thirdparty.com.google.protobuf.ByteString;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
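The newly imported `MetricUtil.captureLatencyNs` is used further down to wrap `handleFlushInternalSynchronized`. As a rough sketch of what such a helper is assumed to do (the actual signature in `org.apache.hadoop.ozone.util.MetricUtil` may differ): it times the supplied block and records the elapsed nanoseconds in the given MutableRate, even when the block throws.

```java
// Sketch of an assumed captureLatencyNs-style helper; not the real MetricUtil code.
@FunctionalInterface
interface ThrowingSupplier<T, E extends Exception> {   // local type for the sketch
  T get() throws E;
}

static <T, E extends Exception> T captureLatencyNsSketch(
    MutableRate metric, ThrowingSupplier<T, E> block) throws E {
  final long start = Time.monotonicNowNanos();
  try {
    return block.get();
  } finally {
    // Recording in finally means exceptional invocations are also counted.
    metric.add(Time.monotonicNowNanos() - start);
  }
}
```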
@@ -532,12 +534,16 @@ private CompletableFuture<Void> watchForCommit(long commitIndex) {
}

LOG.debug("Entering watchForCommit commitIndex = {}", commitIndex);
final long start = Time.monotonicNowNanos();
return sendWatchForCommit(commitIndex)
.thenAccept(this::checkReply)
.exceptionally(e -> {
throw new FlushRuntimeException(setIoException(e));
})
.whenComplete((r, e) -> LOG.debug("Leaving watchForCommit commitIndex = {}", commitIndex));
.whenComplete((r, e) -> {
LOG.debug("Leaving watchForCommit commitIndex = {}", commitIndex);
clientMetrics.getHsyncWatchForCommitNs().add(Time.monotonicNowNanos() - start);
});
}

private void checkReply(XceiverClientReply reply) {
@@ -693,12 +699,15 @@ private void handleFlushInternal(boolean close)
throws IOException, InterruptedException, ExecutionException {
checkOpen();
LOG.debug("Start handleFlushInternal close={}", close);
CompletableFuture<Void> toWaitFor = handleFlushInternalSynchronized(close);
CompletableFuture<Void> toWaitFor = captureLatencyNs(clientMetrics.getHsyncSynchronizedWorkNs(),
() -> handleFlushInternalSynchronized(close));

if (toWaitFor != null) {
LOG.debug("Waiting for flush");
try {
long startWaiting = Time.monotonicNowNanos();
toWaitFor.get();
clientMetrics.getHsyncWaitForFlushNs().add(Time.monotonicNowNanos() - startWaiting);
} catch (ExecutionException ex) {
if (ex.getCause() instanceof FlushRuntimeException) {
throw ((FlushRuntimeException) ex.getCause()).cause;
@@ -727,6 +736,7 @@ public void waitForAllPendingFlushes() throws IOException {
}

private synchronized CompletableFuture<Void> handleFlushInternalSynchronized(boolean close) throws IOException {
long start = Time.monotonicNowNanos();
CompletableFuture<PutBlockResult> putBlockResultFuture = null;
// flush the last chunk data residing on the currentBuffer
if (totalWriteChunkLength < writtenDataLength) {
@@ -768,6 +778,7 @@ private synchronized CompletableFuture<Void> handleFlushInternalSynchronized(boolean close) throws IOException {
if (putBlockResultFuture != null) {
recordWatchForCommitAsync(putBlockResultFuture);
}
clientMetrics.getHsyncSendWriteChunkNs().add(Time.monotonicNowNanos() - start);
Contributor:

this approach can miss the invocations that throw exceptions.

Contributor (Author):

I think it's ok, we want to see metrics for the normal path. This method is long, and I didn't want to add indentation to it.
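For illustration, a minimal sketch of the alternative the reviewer hints at, assuming the field names from this diff: recording the sample in a finally block would also cover invocations that exit with an exception, at the cost of wrapping an already long method body.

```java
// Sketch only (not the merged change): time the method in try/finally so the
// sample is recorded even when an exception is thrown before the add() call.
private synchronized CompletableFuture<Void> handleFlushInternalSynchronized(boolean close) throws IOException {
  final long start = Time.monotonicNowNanos();
  try {
    // ... existing flush / writeChunk / putBlock logic from the diff ...
    return lastFlushFuture;
  } finally {
    clientMetrics.getHsyncSendWriteChunkNs().add(Time.monotonicNowNanos() - start);
  }
}
```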

return lastFlushFuture;
}
