diff --git a/dash-pipeline/SAI/SAI b/dash-pipeline/SAI/SAI index c93e463f6..3fe490d83 160000 --- a/dash-pipeline/SAI/SAI +++ b/dash-pipeline/SAI/SAI @@ -1 +1 @@ -Subproject commit c93e463f643c55d47f14d26acdbc4d18d682f932 +Subproject commit 3fe490d836be6ed1d7c4a8763d5e33053f9e957a diff --git a/dash-pipeline/SAI/sai_api_gen.py b/dash-pipeline/SAI/sai_api_gen.py index ceb574d42..3ea28abf1 100755 --- a/dash-pipeline/SAI/sai_api_gen.py +++ b/dash-pipeline/SAI/sai_api_gen.py @@ -21,7 +21,6 @@ print("Import failed for " + ie.name) exit(1) - if __name__ == "__main__": # CLI parser = argparse.ArgumentParser(description="P4 SAI API generator") diff --git a/dash-pipeline/SAI/src/utils.h b/dash-pipeline/SAI/src/utils.h index b91e3da2b..c61ccde3b 100644 --- a/dash-pipeline/SAI/src/utils.h +++ b/dash-pipeline/SAI/src/utils.h @@ -220,6 +220,30 @@ namespace dash ipaddrSetMask(value.ipaddr, t); } + template<typename T> + void ip4SetVal(const sai_ip4_t &value, T &t, int bits = -1) + { + t->set_value(&value, 4); + } + + template<typename T> + void ip4SetVal(const sai_attribute_value_t &value, T &t, int bits = -1) + { + t->set_value(&value.ip4, 4); + } + + template<typename T> + void ip6SetVal(const sai_ip6_t &value, T &t, int bits = -1) + { + t->set_value(const_cast<uint8_t*>(&value[0]), 16); + } + + template<typename T> + void ip6SetVal(const sai_attribute_value_t &value, T &t, int bits = -1) + { + t->set_value(const_cast<uint8_t*>(&value.ip6[0]), 16); + } + template<typename T> void macSetVal(const sai_attribute_value_t &value, T &t, int bits = -1) { diff --git a/dash-pipeline/SAI/utils/dash_p4/sai_type_solver.py b/dash-pipeline/SAI/utils/dash_p4/sai_type_solver.py index 6f3b0a3a2..005b9a2e0 100644 --- a/dash-pipeline/SAI/utils/dash_p4/sai_type_solver.py +++ b/dash-pipeline/SAI/utils/dash_p4/sai_type_solver.py @@ -42,6 +42,8 @@ class SAITypeSolver: "sai_ipaddr_range_list_t": SAITypeInfo( "sai_ipaddr_range_list_t", "ipaddrrangelist", default="empty" ), + "sai_ip4_t": SAITypeInfo("sai_ip4_t", "ip4", default="0.0.0.0"), + "sai_ip6_t": SAITypeInfo("sai_ip6_t", 
"ip6", default="0:0:0:0:0:0:0:0"), } @staticmethod diff --git a/dash-pipeline/bmv2/dash_metadata.p4 b/dash-pipeline/bmv2/dash_metadata.p4 index 114f493d2..2759f287b 100644 --- a/dash-pipeline/bmv2/dash_metadata.p4 +++ b/dash-pipeline/bmv2/dash_metadata.p4 @@ -114,6 +114,31 @@ enum bit<8> dash_ha_role_t { SWITCHING_TO_ACTIVE = 4 }; +// HA states +enum bit<8> dash_ha_state_t { + DEAD = 0, + // trying to connect to HA pair + CONNECTING = 1, + // bulk sync in progress + CONNECTED = 2, + // ready to be in STANDALONE state waiting for activation of admin role + INITIALIZING_TO_STANDALONE = 3, + // ready to be in ACTIVE state waiting for activation of admin role + INITIALIZING_TO_ACTIVE = 4, + // ready to be in STANDBY state waiting for activation of admin role + INITIALIZING_TO_STANDBY = 5, + // activation done, forwarding traffic + STANDALONE = 6, + // activation done, forwarding traffic and syncing flows with HA pair + ACTIVE = 7, + // activation done, ready to forward traffic if pair fails + STANDBY = 8, + // planned shutdown in progress + DESTROYING = 9, + // gracefully transitioning from paired state to stand-alone + SWITCHING_TO_STANDALONE = 10 +}; + // Flow sync state enum bit<8> dash_ha_flow_sync_state_t { FLOW_MISS = 0, // Flow not created yet @@ -190,7 +215,7 @@ struct metadata_t { // Actions bit<32> routing_actions; - + // Action data bool dropped; // encap_data is for underlay diff --git a/dash-pipeline/bmv2/dash_pipeline.p4 b/dash-pipeline/bmv2/dash_pipeline.p4 index 6ca1b0f4f..1bdaa4fe3 100644 --- a/dash-pipeline/bmv2/dash_pipeline.p4 +++ b/dash-pipeline/bmv2/dash_pipeline.p4 @@ -112,7 +112,8 @@ control dash_ingress( bit<1> disable_fast_path_icmp_flow_redirection, bit<1> full_flow_resimulation_requested, bit<64> max_resimulated_flow_per_second, - @SaiVal[type="sai_object_id_t"] bit<16> routing_group_id) + @SaiVal[type="sai_object_id_t"] bit<16> routing_group_id, + bit<1> is_ha_flow_owner) { meta.eni_data.cps = cps; meta.eni_data.pps = pps; @@ -335,7 +336,7 
@@ control dash_ingress( if (meta.eni_data.admin_state == 0) { deny(); } - + UPDATE_ENI_COUNTER(eni_rx); if (meta.is_fast_path_icmp_flow_redirection_packet) { UPDATE_ENI_COUNTER(eni_lb_fast_path_icmp_in); @@ -372,7 +373,7 @@ control dash_ingress( #endif // TARGET_BMV2_V1MODEL #ifdef TARGET_DPDK_PNA , istd - #endif // TARGET_DPDK_PNA + #endif // TARGET_DPDK_PNA ); if (meta.eni_data.dscp_mode == dash_tunnel_dscp_mode_t.PIPE_MODEL) { diff --git a/dash-pipeline/bmv2/stages/ha.p4 b/dash-pipeline/bmv2/stages/ha.p4 index 16bc0c6fa..0eba94db2 100644 --- a/dash-pipeline/bmv2/stages/ha.p4 +++ b/dash-pipeline/bmv2/stages/ha.p4 @@ -12,7 +12,12 @@ control ha_stage(inout headers_t hdr, @SaiVal[type="sai_dash_ha_role_t"] dash_ha_role_t dash_ha_role, @SaiVal[isreadonly="true"] bit<32> flow_version, bit<1> flow_reconcile_requested, - @SaiVal[isreadonly="true"] bit<1> flow_reconcile_needed + @SaiVal[isreadonly="true"] bit<1> flow_reconcile_needed, + // TODO: vip_v6 requires changes in SAI meta to support sai_ip6_t value + @SaiVal[type="sai_ip4_t"] IPv4Address vip_v4, + bit<1> admin_state, + @SaiVal[isreadonly="true", type="sai_dash_ha_state_t"] dash_ha_state_t dash_ha_state, + bit<1> activate_role ) { meta.ha.ha_set_id = ha_set_id; meta.ha.ha_role = dash_ha_role; @@ -65,11 +70,12 @@ control ha_stage(inout headers_t hdr, bit<16> dp_channel_max_src_port, bit<32> dp_channel_probe_interval_ms, bit<32> dp_channel_probe_fail_threshold, - @SaiVal[isreadonly="true"] bit<1> dp_channel_is_alive + @SaiVal[isreadonly="true"] bit<1> dp_channel_is_alive, + bit<32> switchover_convergence_timeout_ms ) { meta.ha.peer_ip_is_v6 = peer_ip_is_v6; meta.ha.peer_ip = peer_ip; - + meta.ha.dp_channel_dst_port = dp_channel_dst_port; meta.ha.dp_channel_src_port_min = dp_channel_min_src_port; meta.ha.dp_channel_src_port_max = dp_channel_max_src_port; @@ -97,7 +103,7 @@ control ha_stage(inout headers_t hdr, return; } ha_set.apply(); - + // TODO: HA state machine handling. 
} } diff --git a/documentation/high-avail/ha-api-hld.md b/documentation/high-avail/ha-api-hld.md index f76b470f1..75cee0924 100644 --- a/documentation/high-avail/ha-api-hld.md +++ b/documentation/high-avail/ha-api-hld.md @@ -8,6 +8,7 @@ | 0.4 | 04/01/2024 | Riff Jiang | Added capabilities for HA owner, simplified capabilities for HA topology. | | 0.5 | 04/08/2024 | Riff Jiang | Added support for bulk sync. | | 0.6 | 04/09/2024 | Riff Jiang | Added support for flow reconcile for planned and unplanned switchover. | +| 0.7 | 05/20/2024 | Mukesh MV | Added DPU scope DPU driven attributes. | 1. [1. Terminology](#1-terminology) 2. [2. Background](#2-background) @@ -60,6 +61,8 @@ The DASH high availability APIs are a set of APIs to support flow HA feature for DASH. It follows the [SmartSwitch high availability design](https://github.com/sonic-net/SONiC/blob/master/doc/smart-switch/high-availability/smart-switch-ha-hld.md) and used to ensure the flow created on the active DPU can be correctly synchronized to the peered DPU. +It also supports the DPU-Scope-DPU-Driven setup in which the DPU internally owns the HA state machine. + For how the network topology is setup and how flow HA works, such as lifetime management, inline sync, bulk sync, and packet format, please refer to the [SmartSwitch high availability design](https://github.com/sonic-net/SONiC/blob/master/doc/smart-switch/high-availability/smart-switch-ha-hld.md). In this doc, we will only focus on the design from SAI API perspective. ## 3. Overview @@ -70,7 +73,7 @@ To support the [SmartSwitch HA workflows](https://github.com/sonic-net/SONiC/blo - **HA scope**: It controls the failover scope, such as HA role, such as active, standby or standalone, and expected flow version for new flows. Depends on the HA role of the ENI, the packet will be processed differently to get the flow synched. - **Flow table**: It is the container of all flow entries. 
It can be attached to all ENIs in a DPU or being attached to a single DPU, depends on at which level we like to provide the flow HA, i.e. HA scope. - **Flow**: It is used to represent a network connection, which contains match conditions and packet transformations. In HA, each flow will have it own HA-related states, such flow version, flow sync state and etc. -- **ENI**: In ENI-level HA, each ENI will be connected to a HA scope. +- **ENI**: In ENI-level HA, each ENI will be connected to a different HA scope. In DPU scope HA, all ENIs or a group of ENIs will be associated with a HA scope. The components is designed to be conceptually simple and reusable, hence we can use these components to support different HA setup. For example, to support the current ENI-level HA design, these components can be put together as below: @@ -105,6 +108,7 @@ HA set is defined as a SAI object and contains the following SAI attributes: | SAI_HA_SET_ATTR_DP_CHANNEL_PROBE_INTERVAL_MS | `sai_uint32_t` | The interval of the data plane channel probe. | | SAI_HA_SET_ATTR_DP_CHANNEL_PROBE_FAIL_THRESHOLD | `sai_uint32_t` | The threshold of the data plane channel probe fail. | | SAI_HA_SET_ATTR_DP_CHANNEL_IS_ALIVE | `bool` | (Read-only) Is data plane channel alive. | +| SAI_HA_SET_ATTR_SWITCHOVER_CONVERGENCE_TIMEOUT_MS | sai_uint32_t | Time to wait for the network to switchover during planned shutdown used in the case of DPU driven state machine. | ### 4.2. HA Scope @@ -117,6 +121,11 @@ HA scope is also defined as a SAI object and contains the following SAI attribut | SAI_HA_SCOPE_ATTR_FLOW_VERSION | `sai_uint32_t` | The flow version for new flows. | | SAI_HA_SCOPE_ATTR_FLOW_RECONCILE_REQUESTED | `bool` | When set to true, flow reconcile will be initiated. | | SAI_HA_SCOPE_ATTR_FLOW_RECONCILE_NEEDED | `bool` | (Read-only) If true, flow reconcile is needed. | +| SAI_HA_SCOPE_ATTR_VIP_V4 | `sai_ipaddress_t` | Dedicated IPv4 VIP for DPU HA scope. 
| +| SAI_HA_SCOPE_ATTR_VIP_V6 | `sai_ipaddress_t` | Dedicated IPv6 VIP for DPU HA scope. | +| SAI_HA_SCOPE_ATTR_ADMIN_STATE | `bool` | Start or stop the DPU driven HA state machine. | +| SAI_HA_SCOPE_ATTR_HA_STATE | `sai_dash_ha_state_t` | Read-only state in case of DPU driven state machine. | +| SAI_HA_SCOPE_ATTR_ACTIVATE_ROLE | `bool` | Trigger DPU driven HA state machine to enable BFD towards NPUs in order to start receiving traffic destined to VIP. | The HA role is defined as below: @@ -131,6 +140,25 @@ typedef enum _sai_dash_ha_role_t } sai_dash_ha_role_t; ``` +The read-only HA state for DPU driven HA state machine is defined as below: + +```c +typedef enum _sai_dash_ha_state_t +{ + SAI_DASH_HA_STATE_DEAD, + SAI_DASH_HA_STATE_CONNECTING, + SAI_DASH_HA_STATE_CONNECTED, + SAI_DASH_HA_STATE_INITIALIZING_TO_STANDALONE, + SAI_DASH_HA_STATE_INITIALIZING_TO_ACTIVE, + SAI_DASH_HA_STATE_INITIALIZING_TO_STANDBY, + SAI_DASH_HA_STATE_STANDALONE, + SAI_DASH_HA_STATE_ACTIVE, + SAI_DASH_HA_STATE_STANDBY, + SAI_DASH_HA_STATE_DESTROYING, + SAI_DASH_HA_STATE_SWITCHING_TO_STANDALONE, +} sai_dash_ha_state_t; +``` + ### 4.3. Flow table HA uses the DASH flow table to achieve the flow state manipulation. Since the flow table already provides the CRUD operations, we don't need any extra APIs from flow table. @@ -192,6 +220,7 @@ To provide the ENI-level HA control, each ENI will have the following SAI attrib | Attribute name | Type | Description | | -------------- | ---- | ----------- | | SAI_ENI_ATTR_HA_SCOPE_ID | `sai_object_id_t` | The HA scope ID of the ENI. | +| SAI_ENI_ATTR_IS_HA_FLOW_OWNER | `bool` | Determines which DPU in the pair creates flows belonging to this ENI in steady-state. Typically this is True for the Active DPU and False for the Standby DPU. | ### 4.6. Event notifications @@ -675,3 +704,32 @@ sequenceDiagram Note over S0N,S1N: hamgrd continue to drive HA
state machine and update
nexthop on all switches. ``` + +### 6.2. DPU scope DPU driven HA + +In this mode, the DPU owns the HA state machine and drives it based on inputs from the SAI API. The workflows are defined in the DPU scope DPU driven document. + +#### 6.2.1. HA set creation + +The HA bring-up workflow is described below: +- The DPU starts out with its initial HA scope Role as Dead. +- First the SDN controller pushes all configurations including the HA set and the HA scope with role set to Active/Standby but AdminState set to Disabled. +- Then the SDN controller starts the HA state-machine on the DPU by updating the HA scope AdminState to Enabled. +- DPU HA state transitions to Connecting and attempts to connect to its pair specified in the HA set. +- If the connection attempt is unsuccessful it moves to the InitializingToStandalone state and waits for Activate-Role trigger from SDN controller. +- If the connection was successful then the DPU HA state transitions to Connected state and performs bulk sync to synchronize existing flows from the other DPU in the pair in case it is already Active. +- At the end of the bulk sync the HA state transitions to InitializingToActive and waits for Activate-Role trigger from SDN controller. +- At this point the DPU is fully synchronized with the pair and is ready to receive any traffic. But the DPU has not enabled its BFD session to the NPUs, so it will not receive any data traffic yet. +- Whenever the SDN controller decides this DPU is ready to be online, it pushes the Activate-Role trigger. +- DPU enables BFD to all the NPUs and the HA state transitions to Standalone or Active/Standby depending on whether it was able to connect to the paired DPU. + + +#### 6.2.2. Planned shutdown + +The workflow for planned shutdown is as follows: + +- Taking the case where DPUs 0 and 1 are already in Active state in the HA set. SDN controller triggers DPU 0 to gracefully unpair from HA by updating its HA role to Dead. 
+- DPU 0 HA state transitions to Destroying and brings down its BFD session to all NPUs. This causes all NPUs to select DPU 1 as Active and send all traffic to DPU 1. +- DPU 1 HA state transitions to SwitchingToStandalone as it prepares to become Standalone. +- DPU 0 starts a configurable timer to wait for the network to switchover and at the end of it shuts down HA. +- DPU 1 will not resimulate any flows synced from DPU 0 until the SDN controller pushes FlowReconcile trigger. This is to ensure that the synced flows are not disturbed by mistake in case DPU 1 is still catching up to DPU 0's final config state.