Skip to content

Commit

Permalink
Add support for DPU driven DPU scope HA attributes
Browse files Browse the repository at this point in the history
  • Loading branch information
mukeshmv committed Jun 17, 2024
1 parent 235e43b commit 21e911c
Show file tree
Hide file tree
Showing 8 changed files with 126 additions and 11 deletions.
2 changes: 1 addition & 1 deletion dash-pipeline/SAI/SAI
Submodule SAI updated 44 files
+2 −2 azure-pipelines.yml
+0 −116 doc/ARS/Adaptive-Routing-and-Switching.md
+107 −67 doc/SAI-Extensions.md
+0 −591 doc/SAI-Proposal-PoE.md
+0 −438 doc/TAM/SAI-Proposal-TAM-MOD-Localhost.md
+ doc/behavioral model/pipeline_v11.vsdx
+ doc/figures/PoE_Physical_Design.png
+0 −29 doc/tunnel/Tunnel-decap.md
+2 −4 experimental/saiexperimentalbmtor.h
+2 −2 experimental/saiexperimentaldashacl.h
+2 −4 experimental/saiexperimentaldashdirectionlookup.h
+9 −253 experimental/saiexperimentaldasheni.h
+0 −393 experimental/saiexperimentaldashha.h
+5 −31 experimental/saiexperimentaldashinboundrouting.h
+22 −70 experimental/saiexperimentaldashmeter.h
+9 −20 experimental/saiexperimentaldashoutboundcatopa.h
+10 −21 experimental/saiexperimentaldashoutboundrouting.h
+2 −4 experimental/saiexperimentaldashpavalidation.h
+2 −4 experimental/saiexperimentaldashvip.h
+2 −4 experimental/saiexperimentaldashvnet.h
+0 −3 experimental/saiextensions.h
+0 −133 experimental/saiswitchextensions.h
+0 −37 experimental/saitypesextensions.h
+0 −2 inc/sai.h
+2 −20 inc/saiacl.h
+0 −117 inc/saiarsprofile.h
+0 −3 inc/saihostif.h
+0 −522 inc/saipoe.h
+1 −33 inc/saiport.h
+0 −3 inc/saiqueue.h
+0 −21 inc/saiswitch.h
+1 −25 inc/saitam.h
+5 −8 inc/saitunnel.h
+0 −94 inc/saitypes.h
+1 −1 inc/saiversion.h
+0 −9 meta/acronyms.txt
+4 −5 meta/parse.pl
+0 −5 meta/saimetadatatypes.h
+0 −5 meta/saisanitycheck.c
+1 −2 meta/structs.pl
+18 −25 meta/templates/sai_adapter.py.tt
+0 −3 meta/test.pm
+7 −7 ptf/saiacl.py
+1 −1 ptf/saitunnel.py
1 change: 0 additions & 1 deletion dash-pipeline/SAI/sai_api_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
print("Import failed for " + ie.name)
exit(1)


if __name__ == "__main__":
# CLI
parser = argparse.ArgumentParser(description="P4 SAI API generator")
Expand Down
24 changes: 24 additions & 0 deletions dash-pipeline/SAI/src/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,30 @@ namespace dash
ipaddrSetMask(value.ipaddr, t);
}

template<typename T>
void ip4SetVal(const sai_ip4_t &value, T &t, int bits = -1)
{
t->set_value(&value, 4);
}

// Attribute-value overload: stores the `ip4` member of `value` into the
// target field as 4 raw bytes, with no byte-order conversion.
// `bits` is accepted but not used by this overload.
template<typename T>
void ip4SetVal(const sai_attribute_value_t &value, T &t, int bits = -1)
{
(void)bits;
const int ip4_len = 4;
const auto *addr = &value.ip4;
t->set_value(addr, ip4_len);
}

// Stores an IPv6 address into the target field as 16 raw bytes starting at
// value[0].
// `bits` is accepted but not used by this overload.
template<typename T>
void ip6SetVal(const sai_ip6_t &value, T &t, int bits = -1)
{
(void)bits;
const int ip6_len = 16;
// Constness is cast away before the call; presumably set_value() never
// writes through the pointer - TODO confirm against T's API.
uint8_t *addr = const_cast<uint8_t*>(&value[0]);
t->set_value(addr, ip6_len);
}

// Attribute-value overload: stores the `ip6` member of `value` into the
// target field as 16 raw bytes starting at value.ip6[0].
// `bits` is accepted but not used by this overload.
template<typename T>
void ip6SetVal(const sai_attribute_value_t &value, T &t, int bits = -1)
{
(void)bits;
const int ip6_len = 16;
// Constness is cast away before the call; presumably set_value() never
// writes through the pointer - TODO confirm against T's API.
uint8_t *addr = const_cast<uint8_t*>(&value.ip6[0]);
t->set_value(addr, ip6_len);
}

template<typename T>
void macSetVal(const sai_attribute_value_t &value, T &t, int bits = -1)
{
Expand Down
2 changes: 2 additions & 0 deletions dash-pipeline/SAI/utils/dash_p4/sai_type_solver.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ class SAITypeSolver:
"sai_ipaddr_range_list_t": SAITypeInfo(
"sai_ipaddr_range_list_t", "ipaddrrangelist", default="empty"
),
"sai_ip4_t": SAITypeInfo("sai_ip4_t", "ip4", default="0.0.0.0"),
"sai_ip6_t": SAITypeInfo("sai_ip6_t", "ip6", default="0:0:0:0:0:0:0:0"),
}

@staticmethod
Expand Down
27 changes: 26 additions & 1 deletion dash-pipeline/bmv2/dash_metadata.p4
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,31 @@ enum bit<8> dash_ha_role_t {
SWITCHING_TO_ACTIVE = 4
};

// HA states of the DPU driven HA state machine.
// Values are explicit 8-bit codes; the comment above each entry (where
// present) describes what the DPU is doing while in that state.
enum bit<8> dash_ha_state_t {
DEAD = 0,
// trying to connect to HA pair
CONNECTING = 1,
// bulk sync in progress
CONNECTED = 2,
// ready to be in STANDALONE state waiting for activation of admin role
INITIALIZING_TO_STANDALONE = 3,
// ready to be in ACTIVE state waiting for activation of admin role
INITIALIZING_TO_ACTIVE = 4,
// ready to be in STANDBY state waiting for activation of admin role
INITIALIZING_TO_STANDBY = 5,
// activation done, forwarding traffic
STANDALONE = 6,
// activation done, forwarding traffic and syncing flows with HA pair
ACTIVE = 7,
// activation done, ready to forward traffic if pair fails
STANDBY = 8,
// planned shutdown in progress
DESTROYING = 9,
// gracefully transitioning from paired state to stand-alone
SWITCHING_TO_STANDALONE = 10
};

// Flow sync state
enum bit<8> dash_ha_flow_sync_state_t {
FLOW_MISS = 0, // Flow not created yet
Expand Down Expand Up @@ -190,7 +215,7 @@ struct metadata_t {

// Actions
bit<32> routing_actions;

// Action data
bool dropped;
// encap_data is for underlay
Expand Down
7 changes: 4 additions & 3 deletions dash-pipeline/bmv2/dash_pipeline.p4
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ control dash_ingress(
bit<1> disable_fast_path_icmp_flow_redirection,
bit<1> full_flow_resimulation_requested,
bit<64> max_resimulated_flow_per_second,
@SaiVal[type="sai_object_id_t"] bit<16> routing_group_id)
@SaiVal[type="sai_object_id_t"] bit<16> routing_group_id,
bit<1> is_ha_flow_owner)
{
meta.eni_data.cps = cps;
meta.eni_data.pps = pps;
Expand Down Expand Up @@ -335,7 +336,7 @@ control dash_ingress(
if (meta.eni_data.admin_state == 0) {
deny();
}

UPDATE_ENI_COUNTER(eni_rx);
if (meta.is_fast_path_icmp_flow_redirection_packet) {
UPDATE_ENI_COUNTER(eni_lb_fast_path_icmp_in);
Expand Down Expand Up @@ -372,7 +373,7 @@ control dash_ingress(
#endif // TARGET_BMV2_V1MODEL
#ifdef TARGET_DPDK_PNA
, istd
#endif // TARGET_DPDK_PNA
#endif // TARGET_DPDK_PNA
);

if (meta.eni_data.dscp_mode == dash_tunnel_dscp_mode_t.PIPE_MODEL) {
Expand Down
14 changes: 10 additions & 4 deletions dash-pipeline/bmv2/stages/ha.p4
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,12 @@ control ha_stage(inout headers_t hdr,
@SaiVal[type="sai_dash_ha_role_t"] dash_ha_role_t dash_ha_role,
@SaiVal[isreadonly="true"] bit<32> flow_version,
bit<1> flow_reconcile_requested,
@SaiVal[isreadonly="true"] bit<1> flow_reconcile_needed
@SaiVal[isreadonly="true"] bit<1> flow_reconcile_needed,
// TODO: vip_v6 requires changes in SAI meta to support sai_ip6_t value
@SaiVal[type="sai_ip4_t"] IPv4Address vip_v4,
bit<1> admin_state,
@SaiVal[isreadonly="true", type="sai_dash_ha_state_t"] dash_ha_state_t dash_ha_state,
bit<1> activate_role
) {
meta.ha.ha_set_id = ha_set_id;
meta.ha.ha_role = dash_ha_role;
Expand Down Expand Up @@ -65,11 +70,12 @@ control ha_stage(inout headers_t hdr,
bit<16> dp_channel_max_src_port,
bit<32> dp_channel_probe_interval_ms,
bit<32> dp_channel_probe_fail_threshold,
@SaiVal[isreadonly="true"] bit<1> dp_channel_is_alive
@SaiVal[isreadonly="true"] bit<1> dp_channel_is_alive,
bit<32> switchover_convergence_timeout_ms
) {
meta.ha.peer_ip_is_v6 = peer_ip_is_v6;
meta.ha.peer_ip = peer_ip;

meta.ha.dp_channel_dst_port = dp_channel_dst_port;
meta.ha.dp_channel_src_port_min = dp_channel_min_src_port;
meta.ha.dp_channel_src_port_max = dp_channel_max_src_port;
Expand Down Expand Up @@ -97,7 +103,7 @@ control ha_stage(inout headers_t hdr,
return;
}
ha_set.apply();

// TODO: HA state machine handling.
}
}
Expand Down
60 changes: 59 additions & 1 deletion documentation/high-avail/ha-api-hld.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
| 0.4 | 04/01/2024 | Riff Jiang | Added capabilities for HA owner, simplified capabilities for HA topology. |
| 0.5 | 04/08/2024 | Riff Jiang | Added support for bulk sync. |
| 0.6 | 04/09/2024 | Riff Jiang | Added support for flow reconcile for planned and unplanned switchover. |
| 0.7 | 05/20/2024 | Mukesh MV | Added DPU scope DPU driven attributes. |

1. [1. Terminology](#1-terminology)
2. [2. Background](#2-background)
Expand Down Expand Up @@ -60,6 +61,8 @@

The DASH high availability APIs are a set of APIs that support the flow HA feature for DASH. They follow the [SmartSwitch high availability design](https://github.com/sonic-net/SONiC/blob/master/doc/smart-switch/high-availability/smart-switch-ha-hld.md) and are used to ensure that flows created on the active DPU are correctly synchronized to the peer DPU.

It also supports the DPU-Scope-DPU-Driven setup in which the DPU internally owns the HA state machine.

For how the network topology is setup and how flow HA works, such as lifetime management, inline sync, bulk sync, and packet format, please refer to the [SmartSwitch high availability design](https://github.com/sonic-net/SONiC/blob/master/doc/smart-switch/high-availability/smart-switch-ha-hld.md). In this doc, we will only focus on the design from SAI API perspective.

## 3. Overview
Expand All @@ -70,7 +73,7 @@ To support the [SmartSwitch HA workflows](https://github.com/sonic-net/SONiC/blo
- **HA scope**: It controls the failover scope, including the HA role (such as active, standby or standalone) and the expected flow version for new flows. Depending on the HA role of the ENI, the packet will be processed differently to get the flow synced.
- **Flow table**: It is the container of all flow entries. It can be attached to all ENIs in a DPU or be attached to a single DPU, depending on the level at which we would like to provide flow HA, i.e. the HA scope.
- **Flow**: It is used to represent a network connection, which contains match conditions and packet transformations. In HA, each flow will have its own HA-related states, such as flow version, flow sync state, etc.
- **ENI**: In ENI-level HA, each ENI will be connected to a HA scope.
- **ENI**: In ENI-level HA, each ENI will be connected to a different HA scope. In DPU scope HA, all ENIs or a group of ENIs will be associated with a HA scope.

The components are designed to be conceptually simple and reusable, so we can use them to support different HA setups. For example, to support the current ENI-level HA design, these components can be put together as below:

Expand Down Expand Up @@ -105,6 +108,7 @@ HA set is defined as a SAI object and contains the following SAI attributes:
| SAI_HA_SET_ATTR_DP_CHANNEL_PROBE_INTERVAL_MS | `sai_uint32_t` | The interval of the data plane channel probe. |
| SAI_HA_SET_ATTR_DP_CHANNEL_PROBE_FAIL_THRESHOLD | `sai_uint32_t` | The threshold of the data plane channel probe fail. |
| SAI_HA_SET_ATTR_DP_CHANNEL_IS_ALIVE | `bool` | (Read-only) Is data plane channel alive. |
| SAI_HA_SET_ATTR_SWITCHOVER_CONVERGENCE_TIMEOUT_MS | `sai_uint32_t` | Time to wait for the network to switch over during a planned shutdown. Used when the HA state machine is DPU driven. |

### 4.2. HA Scope

Expand All @@ -117,6 +121,11 @@ HA scope is also defined as a SAI object and contains the following SAI attribut
| SAI_HA_SCOPE_ATTR_FLOW_VERSION | `sai_uint32_t` | The flow version for new flows. |
| SAI_HA_SCOPE_ATTR_FLOW_RECONCILE_REQUESTED | `bool` | When set to true, flow reconcile will be initiated. |
| SAI_HA_SCOPE_ATTR_FLOW_RECONCILE_NEEDED | `bool` | (Read-only) If true, flow reconcile is needed. |
| SAI_HA_SCOPE_ATTR_VIP_V4 | `sai_ip4_t` | Dedicated IPv4 VIP for DPU HA scope. |
| SAI_HA_SCOPE_ATTR_VIP_V6 | `sai_ip6_t` | Dedicated IPv6 VIP for DPU HA scope. |
| SAI_HA_SCOPE_ATTR_ADMIN_STATE | `bool` | Start or stop the DPU driven HA state machine. |
| SAI_HA_SCOPE_ATTR_HA_STATE | `sai_dash_ha_state_t` | (Read-only) Current HA state when the HA state machine is DPU driven. |
| SAI_HA_SCOPE_ATTR_ACTIVATE_ROLE | `bool` | Trigger DPU driven HA state machine to enable BFD towards NPUs in order to start receiving traffic destined to VIP. |

The HA role is defined as below:

Expand All @@ -131,6 +140,25 @@ typedef enum _sai_dash_ha_role_t
} sai_dash_ha_role_t;
```

The read-only HA state for DPU driven HA state machine is defined as below:

```c
typedef enum _sai_dash_ha_state_t
{
SAI_DASH_HA_STATE_DEAD,
/* Trying to connect to the HA pair. */
SAI_DASH_HA_STATE_CONNECTING,
/* Connected to the HA pair; bulk sync in progress. */
SAI_DASH_HA_STATE_CONNECTED,
/* Ready to become standalone; waiting for the activate-role trigger. */
SAI_DASH_HA_STATE_INITIALIZING_TO_STANDALONE,
/* Ready to become active; waiting for the activate-role trigger. */
SAI_DASH_HA_STATE_INITIALIZING_TO_ACTIVE,
/* Ready to become standby; waiting for the activate-role trigger. */
SAI_DASH_HA_STATE_INITIALIZING_TO_STANDBY,
/* Activation done; forwarding traffic. */
SAI_DASH_HA_STATE_STANDALONE,
/* Activation done; forwarding traffic and syncing flows with the HA pair. */
SAI_DASH_HA_STATE_ACTIVE,
/* Activation done; ready to forward traffic if the pair fails. */
SAI_DASH_HA_STATE_STANDBY,
/* Planned shutdown in progress. */
SAI_DASH_HA_STATE_DESTROYING,
/* Gracefully transitioning from paired state to standalone. */
SAI_DASH_HA_STATE_SWITCHING_TO_STANDALONE,
} sai_dash_ha_state_t;
```

### 4.3. Flow table

HA uses the DASH flow table to achieve the flow state manipulation. Since the flow table already provides the CRUD operations, we don't need any extra APIs from flow table.
Expand Down Expand Up @@ -192,6 +220,7 @@ To provide the ENI-level HA control, each ENI will have the following SAI attrib
| Attribute name | Type | Description |
| -------------- | ---- | ----------- |
| SAI_ENI_ATTR_HA_SCOPE_ID | `sai_object_id_t` | The HA scope ID of the ENI. |
| SAI_ENI_ATTR_IS_HA_FLOW_OWNER | `bool` | Determines which DPU in the pair creates flows belonging to this ENI in steady-state. Typically this is True for the Active DPU and False for the Standby DPU. |

### 4.6. Event notifications

Expand Down Expand Up @@ -675,3 +704,32 @@ sequenceDiagram
Note over S0N,S1N: hamgrd continue to drive HA<br>state machine and update<br>nexthop on all switches.
```

### 6.2. DPU scope DPU driven HA

In this mode, the DPU owns the HA state machine and drives it based on inputs from the SAI API. The workflows are defined in the DPU scope DPU driven document.

#### 6.2.1. HA set creation

The HA bring-up workflow is described below:
- The DPU starts out with its initial HA scope Role as Dead.
- First the SDN controller pushes all configurations including the HA set and the HA scope with role set to Active/Standby but AdminState set to Disabled.
- Then the SDN controller starts the HA state-machine on the DPU by updating the HA scope AdminState to Enabled.
- DPU HA state transitions to Connecting and attempts to connect to its pair specified in the HA set.
- If the connection attempt is unsuccessful it moves to the InitializingToStandalone state and waits for Activate-Role trigger from SDN controller.
- If the connection was successful then the DPU HA state transitions to Connected state and performs bulk sync to synchronize existing flows from the other DPU in the pair in case it is already Active.
- At the end of the bulk sync the HA state transitions to InitializingToActive and waits for Activate-Role trigger from SDN controller.
- At this point the DPU is fully synchronized with the pair and is ready to receive any traffic. But the DPU has not enabled its BFD session to the NPUs, so it will not receive any data traffic yet.
- Whenever the SDN controller decides this DPU is ready to be online, it pushes the Activate-Role trigger.
- DPU enables BFD to all the NPUs and the HA state transitions to Standalone or Active/Standby depending on whether it was able to connect to the paired DPU.


#### 6.2.2. Planned shutdown

The workflow for planned shutdown is as follows:

- Taking the case where DPUs 0 and 1 are already in Active state in the HA set. SDN controller triggers DPU 0 to gracefully unpair from HA by updating its HA role to Dead.
- DPU 0 HA state transitions to Destroying and brings down its BFD session to all NPUs. This causes all NPUs to select DPU 1 as Active and send all traffic to DPU 1.
- DPU 1 HA state transitions to SwitchingToStandalone as it prepares to become Standalone.
- DPU 0 starts a configurable timer (SAI_HA_SET_ATTR_SWITCHOVER_CONVERGENCE_TIMEOUT_MS) to wait for the network to switch over, and shuts down HA when the timer expires.
- DPU 1 will not resimulate any flows synced from DPU 0 until the SDN controller pushes FlowReconcile trigger. This is to ensure that the synced flows are not disturbed by mistake in case DPU 1 is still catching up to DPU 0's final config state.

0 comments on commit 21e911c

Please sign in to comment.