Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HDDS-11367. Improve ozone balancing status command output #7139

Merged
merged 45 commits into from
Dec 12, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
774fbbc
HDDS-11367. Improve ozone balancing status command output
juncevich Aug 30, 2024
00ad471
HDDS-11367. Improve ozone balancing status command output
juncevich Sep 1, 2024
bac5794
HDDS-11367. Add space between Gb and Mb in getPrettySize
juncevich Sep 19, 2024
9dbd39c
HDDS-11367. Fix review notices + add unit tests
juncevich Oct 3, 2024
4f6af05
Merge branch 'master' into HDDS-11367
juncevich Oct 3, 2024
570025f
HDDS-11367. Small fixes + improve robot test
juncevich Oct 7, 2024
0c04a2a
HDDS-11367. Add licences to files
juncevich Oct 8, 2024
f1f11ec
HDDS-11367. Fix review notice
juncevich Oct 8, 2024
68766f9
HDDS-11367. Improve balancer robot test
juncevich Oct 8, 2024
4138ad7
HDDS-11367. Fix review notices
juncevich Oct 10, 2024
4d916a5
HDDS-11367. Fix review notices
juncevich Oct 15, 2024
75a103c
HDDS-11367. Fix review notices
juncevich Oct 16, 2024
33af9b1
HDDS-11367. Improve javadocs
juncevich Oct 17, 2024
8ff20ee
HDDS-11367. fix timeouts in balancer test
juncevich Oct 17, 2024
4a7d8cd
Merge branch 'master' into HDDS-11367
juncevich Oct 18, 2024
5284e6b
HDDS-11367. fix robo test
juncevich Oct 18, 2024
051ea1d
HDDS-11367. fix robo test
juncevich Oct 18, 2024
49b6e14
HDDS-11367. fix robo test
juncevich Oct 19, 2024
1513b9c
HDDS-11367. fix robo test
juncevich Oct 19, 2024
16683a2
HDDS-11367. fix robo test
juncevich Oct 19, 2024
899f314
HDDS-11367. fix robo test
juncevich Oct 20, 2024
c8e21d9
HDDS-11367. fix review notices
juncevich Oct 24, 2024
b0cb760
HDDS-11367. Fix review comments
juncevich Oct 25, 2024
d682ccc
HDDS-11367. Add licence info to classes
juncevich Oct 25, 2024
2c4e1f4
HDDS-11367. Partly fix review notices
juncevich Oct 28, 2024
ad0c95d
HDDS-11367. Fix review notices
juncevich Oct 30, 2024
4243f98
HDDS-11367. Fix review notices
juncevich Oct 30, 2024
e9080b3
HDDS-11367. Fix review notices
juncevich Oct 30, 2024
dabd4c7
HDDS-11367. Fix review notices
juncevich Oct 30, 2024
c6562fc
HDDS-11367. Fix tests
juncevich Nov 1, 2024
361ba74
HDDS-11367. Fix tests
juncevich Nov 1, 2024
78caa97
HDDS-11367. Fix tests
juncevich Nov 1, 2024
6a68f45
Merge branch 'master' into HDDS-11367
juncevich Nov 2, 2024
bae2a6b
HDDS-11367. Fix tests
juncevich Nov 2, 2024
bea5d60
HDDS-11367. Fix tests
juncevich Nov 4, 2024
9ae7527
HDDS-11367. Fix tests
juncevich Nov 4, 2024
9e8213b
HDDS-11367. Add tests
juncevich Nov 5, 2024
786fc4f
HDDS-11367. Fix review comments.
juncevich Nov 20, 2024
e77ea14
HDDS-11367. Fix review comments.
juncevich Nov 20, 2024
6ccd658
HDDS-11367. Fix flaky TestContainerBalancerStatusInfo.testGetCurrentS…
juncevich Nov 22, 2024
6acad2b
Merge branch 'master' into HDDS-11367
juncevich Nov 25, 2024
15a8558
HDDS-11367. Remove unstarted balancing iteration
juncevich Nov 25, 2024
2e60bf3
HDDS-11367. Fix review notice. Refactor saving iteration statistic.
juncevich Nov 25, 2024
e41c219
Merge branch 'master' into HDDS-11367
juncevich Dec 4, 2024
4ff34c7
HDDS-11367. Fix review flaky test.
juncevich Dec 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -627,19 +627,20 @@ message ContainerBalancerStatusInfo {
message ContainerBalancerTaskIterationStatusInfo {
juncevich marked this conversation as resolved.
Show resolved Hide resolved
optional int32 iterationNumber = 1;
optional string iterationResult = 2;
optional int64 sizeScheduledForMoveGB = 3;
optional int64 dataSizeMovedGB = 4;
optional int64 sizeScheduledForMove = 3;
optional int64 dataSizeMoved = 4;
optional int64 containerMovesScheduled = 5;
optional int64 containerMovesCompleted = 6;
optional int64 containerMovesFailed = 7;
optional int64 containerMovesTimeout = 8;
repeated NodeTransferInfo sizeEnteringNodesGB = 9;
repeated NodeTransferInfo sizeLeavingNodesGB = 10;
repeated NodeTransferInfo sizeEnteringNodes = 9;
repeated NodeTransferInfo sizeLeavingNodes = 10;
optional int64 iterationDuration = 11;
}

message NodeTransferInfo {
optional string uuid = 1;
optional int64 dataVolumeGB = 2;
optional int64 dataVolume = 2;
}

message DecommissionScmRequestProto {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ public final class ContainerBalancerMetrics {
" in the latest iteration.")
private MutableCounterLong dataSizeMovedGBInLatestIteration;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need this metric now? Or is it for backward compatibility?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For backward compatibility.


@Metric(about = "Amount of bytes that Container Balancer moved" +
juncevich marked this conversation as resolved.
Show resolved Hide resolved
" in the latest iteration.")
private MutableCounterLong dataSizeMovedBytesInLatestIteration;

@Metric(about = "Number of completed container moves performed by " +
"Container Balancer in the latest iteration.")
private MutableCounterLong numContainerMovesCompletedInLatestIteration;
Expand Down Expand Up @@ -154,6 +158,24 @@ public void resetDataSizeMovedGBInLatestIteration() {
-getDataSizeMovedGBInLatestIteration());
}

/**
* Gets the amount of data moved by Container Balancer in the latest
juncevich marked this conversation as resolved.
Show resolved Hide resolved
* iteration.
* @return size in bytes
*/
public long getDataSizeMovedInLatestIteration() {
return dataSizeMovedBytesInLatestIteration.value();
}

public void incrementDataSizeMovedInLatestIteration(long valueToAdd) {
juncevich marked this conversation as resolved.
Show resolved Hide resolved
this.dataSizeMovedBytesInLatestIteration.incr(valueToAdd);
}

public void resetDataSizeMovedInLatestIteration() {
juncevich marked this conversation as resolved.
Show resolved Hide resolved
dataSizeMovedBytesInLatestIteration.incr(
juncevich marked this conversation as resolved.
Show resolved Hide resolved
-getDataSizeMovedInLatestIteration());
}

/**
* Gets the number of container moves performed by Container Balancer in the
* latest iteration.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,12 @@
package org.apache.hadoop.hdds.scm.container.balancer;

import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos;

import java.time.OffsetDateTime;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

/**
* Info about balancer status.
Expand Down Expand Up @@ -51,4 +54,49 @@ public HddsProtos.ContainerBalancerConfigurationProto getConfiguration() {
public List<ContainerBalancerTaskIterationStatusInfo> getIterationsStatusInfo() {
return iterationsStatusInfo;
}

public StorageContainerLocationProtocolProtos.ContainerBalancerStatusInfo toProto() {
juncevich marked this conversation as resolved.
Show resolved Hide resolved
return StorageContainerLocationProtocolProtos.ContainerBalancerStatusInfo
.newBuilder()
.setStartedAt(getStartedAt().toEpochSecond())
.setConfiguration(getConfiguration())
.addAllIterationsStatusInfo(
getIterationsStatusInfo()
.stream()
.map(
info -> StorageContainerLocationProtocolProtos.ContainerBalancerTaskIterationStatusInfo.newBuilder()
juncevich marked this conversation as resolved.
Show resolved Hide resolved
.setIterationNumber(info.getIterationNumber())
.setIterationResult(Optional.ofNullable(info.getIterationResult()).orElse(""))
.setIterationDuration(info.getIterationDuration())
.setSizeScheduledForMove(info.getSizeScheduledForMove())
.setDataSizeMoved(info.getDataSizeMoved())
.setContainerMovesScheduled(info.getContainerMovesScheduled())
.setContainerMovesCompleted(info.getContainerMovesCompleted())
.setContainerMovesFailed(info.getContainerMovesFailed())
.setContainerMovesTimeout(info.getContainerMovesTimeout())
.addAllSizeEnteringNodes(
info.getSizeEnteringNodes().entrySet()
.stream()
.map(entry -> StorageContainerLocationProtocolProtos.NodeTransferInfo.newBuilder()
.setUuid(entry.getKey().toString())
.setDataVolume(entry.getValue())
.build()
)
.collect(Collectors.toList())
)
.addAllSizeLeavingNodes(
info.getSizeLeavingNodes().entrySet()
.stream()
.map(entry -> StorageContainerLocationProtocolProtos.NodeTransferInfo.newBuilder()
.setUuid(entry.getKey().toString())
.setDataVolume(entry.getValue())
.build()
)
.collect(Collectors.toList())
)
.build()
)
.collect(Collectors.toList())
).build();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@

import java.io.IOException;
import java.time.Duration;
import java.time.OffsetDateTime;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
Expand All @@ -60,6 +61,7 @@
import java.util.concurrent.TimeoutException;
import java.util.stream.Collectors;

import static java.time.OffsetDateTime.now;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_NODE_REPORT_INTERVAL;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_NODE_REPORT_INTERVAL_DEFAULT;

Expand Down Expand Up @@ -118,7 +120,7 @@ public class ContainerBalancerTask implements Runnable {
private int nextIterationIndex;
private boolean delayStart;
private List<ContainerBalancerTaskIterationStatusInfo> iterationsStatistic;

private OffsetDateTime currentIterationStarted;
juncevich marked this conversation as resolved.
Show resolved Hide resolved
juncevich marked this conversation as resolved.
Show resolved Hide resolved
/**
* Constructs ContainerBalancerTask with the specified arguments.
*
Expand Down Expand Up @@ -215,6 +217,7 @@ private void balance() {
// leader change or restart
int i = nextIterationIndex;
for (; i < iterations && isBalancerRunning(); i++) {
currentIterationStarted = now();
// reset some variables and metrics for this iteration
resetState();
if (config.getTriggerDuEnable()) {
Expand Down Expand Up @@ -262,7 +265,7 @@ private void balance() {
}

IterationResult iR = doIteration();
saveIterationStatistic(i, iR);
saveIterationStatistic(i + 1, iR);
juncevich marked this conversation as resolved.
Show resolved Hide resolved
metrics.incrementNumIterations(1);

LOG.info("Result of this iteration of Container Balancer: {}", iR);
Expand Down Expand Up @@ -310,8 +313,9 @@ private void saveIterationStatistic(Integer iterationNumber, IterationResult iR)
ContainerBalancerTaskIterationStatusInfo iterationStatistic = new ContainerBalancerTaskIterationStatusInfo(
iterationNumber,
iR.name(),
getSizeScheduledForMoveInLatestIteration() / OzoneConsts.GB,
metrics.getDataSizeMovedGBInLatestIteration(),
now().toEpochSecond() - currentIterationStarted.toEpochSecond(),
juncevich marked this conversation as resolved.
Show resolved Hide resolved
getSizeScheduledForMoveInLatestIteration(),
metrics.getDataSizeMovedInLatestIteration(),
metrics.getNumContainerMovesScheduledInLatestIteration(),
metrics.getNumContainerMovesCompletedInLatestIteration(),
metrics.getNumContainerMovesFailedInLatestIteration(),
Expand All @@ -324,7 +328,7 @@ private void saveIterationStatistic(Integer iterationNumber, IterationResult iR)
.collect(
Collectors.toMap(
entry -> entry.getKey().getUuid(),
entry -> entry.getValue() / OzoneConsts.GB
Map.Entry::getValue
juncevich marked this conversation as resolved.
Show resolved Hide resolved
)
),
findSourceStrategy.getSizeLeavingNodes()
Expand All @@ -335,25 +339,23 @@ private void saveIterationStatistic(Integer iterationNumber, IterationResult iR)
.collect(
Collectors.toMap(
entry -> entry.getKey().getUuid(),
entry -> entry.getValue() / OzoneConsts.GB
entry -> entry.getValue()
juncevich marked this conversation as resolved.
Show resolved Hide resolved
)
)
);
iterationsStatistic.add(iterationStatistic);
}

public List<ContainerBalancerTaskIterationStatusInfo> getCurrentIterationsStatistic() {

int lastIterationNumber = iterationsStatistic.stream()
.mapToInt(ContainerBalancerTaskIterationStatusInfo::getIterationNumber)
.max()
.orElse(0);

int lastIterationNumber = iterationsStatistic.isEmpty() ? 0 :
juncevich marked this conversation as resolved.
Show resolved Hide resolved
iterationsStatistic.get(iterationsStatistic.size() - 1).getIterationNumber();
long iterationDuration = getCurrentIterationDuration();
ContainerBalancerTaskIterationStatusInfo currentIterationStatistic = new ContainerBalancerTaskIterationStatusInfo(
lastIterationNumber + 1,
lastIterationNumber,
null,
juncevich marked this conversation as resolved.
Show resolved Hide resolved
getSizeScheduledForMoveInLatestIteration() / OzoneConsts.GB,
sizeActuallyMovedInLatestIteration / OzoneConsts.GB,
iterationDuration,
getSizeScheduledForMoveInLatestIteration(),
sizeActuallyMovedInLatestIteration,
metrics.getNumContainerMovesScheduledInLatestIteration(),
metrics.getNumContainerMovesCompletedInLatestIteration(),
metrics.getNumContainerMovesFailedInLatestIteration(),
Expand All @@ -365,7 +367,7 @@ public List<ContainerBalancerTaskIterationStatusInfo> getCurrentIterationsStatis
.filter(datanodeDetailsLongEntry -> datanodeDetailsLongEntry.getValue() > 0)
.collect(Collectors.toMap(
entry -> entry.getKey().getUuid(),
entry -> entry.getValue() / OzoneConsts.GB
entry -> entry.getValue()
)
),
findSourceStrategy.getSizeLeavingNodes()
juncevich marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -376,7 +378,7 @@ public List<ContainerBalancerTaskIterationStatusInfo> getCurrentIterationsStatis
.collect(
Collectors.toMap(
entry -> entry.getKey().getUuid(),
entry -> entry.getValue() / OzoneConsts.GB
entry -> entry.getValue()
)
)
);
Expand All @@ -385,6 +387,14 @@ public List<ContainerBalancerTaskIterationStatusInfo> getCurrentIterationsStatis
return resultList;
}

private long getCurrentIterationDuration() {
if (currentIterationStarted == null) {
return -1L;
juncevich marked this conversation as resolved.
Show resolved Hide resolved
} else {
return now().toEpochSecond() - currentIterationStarted.toEpochSecond();
}
}

/**
* Logs the reason for stop and save configuration and stop the task.
*
Expand Down Expand Up @@ -720,6 +730,8 @@ private void checkIterationMoveResults() {
metrics.getNumContainerMovesTimeoutInLatestIteration());
metrics.incrementDataSizeMovedGBInLatestIteration(
sizeActuallyMovedInLatestIteration / OzoneConsts.GB);
metrics.incrementDataSizeMovedInLatestIteration(
juncevich marked this conversation as resolved.
Show resolved Hide resolved
sizeActuallyMovedInLatestIteration);
metrics.incrementDataSizeMovedGB(
metrics.getDataSizeMovedGBInLatestIteration());
metrics.incrementNumContainerMovesFailed(
Expand Down Expand Up @@ -1146,6 +1158,7 @@ private void resetState() {
this.sizeScheduledForMoveInLatestIteration = 0;
this.sizeActuallyMovedInLatestIteration = 0;
metrics.resetDataSizeMovedGBInLatestIteration();
metrics.resetDataSizeMovedInLatestIteration();
metrics.resetNumContainerMovesScheduledInLatestIteration();
metrics.resetNumContainerMovesCompletedInLatestIteration();
metrics.resetNumContainerMovesTimeoutInLatestIteration();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,37 +27,40 @@
public class ContainerBalancerTaskIterationStatusInfo {
private final Integer iterationNumber;
private final String iterationResult;
private final long sizeScheduledForMoveGB;
private final long dataSizeMovedGB;
private final long iterationDuration;
private final long sizeScheduledForMove;
private final long dataSizeMoved;
private final long containerMovesScheduled;
private final long containerMovesCompleted;
private final long containerMovesFailed;
private final long containerMovesTimeout;
private final Map<UUID, Long> sizeEnteringNodesGB;
private final Map<UUID, Long> sizeLeavingNodesGB;
private final Map<UUID, Long> sizeEnteringNodes;
private final Map<UUID, Long> sizeLeavingNodes;

@SuppressWarnings("checkstyle:ParameterNumber")
public ContainerBalancerTaskIterationStatusInfo(
juncevich marked this conversation as resolved.
Show resolved Hide resolved
Integer iterationNumber,
String iterationResult,
long sizeScheduledForMoveGB,
long dataSizeMovedGB,
long iterationDuration,
long sizeScheduledForMove,
long dataSizeMoved,
long containerMovesScheduled,
long containerMovesCompleted,
long containerMovesFailed,
long containerMovesTimeout,
Map<UUID, Long> sizeEnteringNodesGB,
Map<UUID, Long> sizeLeavingNodesGB) {
Map<UUID, Long> sizeEnteringNodes,
Map<UUID, Long> sizeLeavingNodes) {
this.iterationNumber = iterationNumber;
this.iterationResult = iterationResult;
this.sizeScheduledForMoveGB = sizeScheduledForMoveGB;
this.dataSizeMovedGB = dataSizeMovedGB;
this.iterationDuration = iterationDuration;
this.sizeScheduledForMove = sizeScheduledForMove;
this.dataSizeMoved = dataSizeMoved;
this.containerMovesScheduled = containerMovesScheduled;
this.containerMovesCompleted = containerMovesCompleted;
this.containerMovesFailed = containerMovesFailed;
this.containerMovesTimeout = containerMovesTimeout;
this.sizeEnteringNodesGB = sizeEnteringNodesGB;
this.sizeLeavingNodesGB = sizeLeavingNodesGB;
this.sizeEnteringNodes = sizeEnteringNodes;
this.sizeLeavingNodes = sizeLeavingNodes;
}

public Integer getIterationNumber() {
Expand All @@ -68,12 +71,12 @@ public String getIterationResult() {
return iterationResult;
}

public long getSizeScheduledForMoveGB() {
return sizeScheduledForMoveGB;
public long getSizeScheduledForMove() {
return sizeScheduledForMove;
}

public long getDataSizeMovedGB() {
return dataSizeMovedGB;
public long getDataSizeMoved() {
return dataSizeMoved;
}

public long getContainerMovesScheduled() {
Expand All @@ -92,12 +95,16 @@ public long getContainerMovesTimeout() {
return containerMovesTimeout;
}

public Map<UUID, Long> getSizeEnteringNodesGB() {
return sizeEnteringNodesGB;
public Map<UUID, Long> getSizeEnteringNodes() {
return sizeEnteringNodes;
}

public Map<UUID, Long> getSizeLeavingNodesGB() {
return sizeLeavingNodesGB;
public Map<UUID, Long> getSizeLeavingNodes() {
return sizeLeavingNodes;
}

public long getIterationDuration() {
return iterationDuration;
}
}

Expand Down
Loading