Skip to content

Commit

Permalink
HDDS-11463. Track and display failed DataNode storage locations in SCM.
Browse files Browse the repository at this point in the history
  • Loading branch information
fanshilun committed Nov 11, 2024
1 parent 2797c45 commit 1c16cb2
Show file tree
Hide file tree
Showing 16 changed files with 560 additions and 97 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,10 @@ public final class HddsConfigKeys {
public static final double
HDDS_SCM_SAFEMODE_ONE_NODE_REPORTED_PIPELINE_PCT_DEFAULT = 0.90;

// Config key, and its default value, for the percentage of successfully
// reported datanodes (the key is also declared in ozone-default.xml).
// NOTE(review): the default 0.10 is presumably a fraction, i.e. 10%, like the
// other *_PCT defaults in this class; the key name suggests it is an SCM
// safe-mode threshold -- confirm against the code that reads it.
public static final String HDDS_SCM_SAFEMODE_REPORTED_DATANODE_PCT =
"hdds.scm.safemode.reported.datanode.pct";
public static final double HDDS_SCM_SAFEMODE_REPORTED_DATANODE_PCT_DEFAULT = 0.10;

// This configuration setting is used as a fallback location by all
// Ozone/HDDS services for their metadata. It is useful as a single
// config point for test/PoC clusters.
Expand Down
9 changes: 9 additions & 0 deletions hadoop-hdds/common/src/main/resources/ozone-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1702,6 +1702,15 @@
</description>
</property>

<property>
<name>hdds.scm.safemode.reported.datanode.pct</name>
<value>0.10</value>
<tag>HDDS,SCM,OPERATION</tag>
<description>
Percentage of successfully reported datanodes, expressed as a fraction
(for example, 0.10 means 10%).
</description>
</property>

<property>
<name>hdds.container.action.max.limit</name>
<value>20</value>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,13 @@
.StorageContainerDatanodeProtocolProtos.ContainerReportsProto;
import org.apache.hadoop.hdds.scm.ScmConfig;
import org.apache.hadoop.hdds.scm.container.report.ContainerReportValidator;
import org.apache.hadoop.hdds.scm.events.SCMEvents;
import org.apache.hadoop.hdds.scm.ha.SCMContext;
import org.apache.hadoop.hdds.scm.node.NodeManager;
import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
import org.apache.hadoop.hdds.scm.server.SCMDatanodeHeartbeatDispatcher
.ContainerReportFromDatanode;
import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer;
import org.apache.hadoop.hdds.server.events.EventHandler;
import org.apache.hadoop.hdds.server.events.EventPublisher;
import org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException;
Expand Down Expand Up @@ -199,6 +201,11 @@ public void onMessage(final ContainerReportFromDatanode reportFromDatanode,
// list
processMissingReplicas(datanodeDetails, expectedContainersInDatanode);
containerManager.notifyContainerReportProcessing(true, true);
if (reportFromDatanode.isRegister()) {
publisher.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT,
new SCMDatanodeProtocolServer.NodeRegistrationContainerReport(datanodeDetails,
reportFromDatanode.getReport()));
}
}
} catch (NodeNotFoundException ex) {
containerManager.notifyContainerReportProcessing(true, false);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,13 @@ public final class SCMEvents {
NodeRegistrationContainerReport.class,
"Node_Registration_Container_Report");

/**
* Event carrying a {@link NodeRegistrationContainerReport}, published under
* the name "Container_Registration_Report".
*
* It is fired by the container report handler once it has finished
* processing a container report whose {@code isRegister()} flag is set,
* i.e. a report flagged as a registration report. It uses the same payload type as
* the "Node_Registration_Container_Report" event but is a distinct event.
*/
public static final TypedEvent<NodeRegistrationContainerReport>
CONTAINER_REGISTRATION_REPORT = new TypedEvent<>(
NodeRegistrationContainerReport.class, "Container_Registration_Report");

/**
* ContainerReports are sent out by Datanodes. This report is received by
* SCMDatanodeHeartbeatDispatcher and Container_Report Event is generated.
Expand Down
Loading

0 comments on commit 1c16cb2

Please sign in to comment.