From 6eda798ac34e7a5393b21114777775eaaa7d3733 Mon Sep 17 00:00:00 2001 From: Umair Date: Thu, 2 Jan 2025 18:17:27 +0000 Subject: [PATCH] Add gatekeeper to manage sockets. Add apis to Open/Connect datagram sockets. Add Socket sanity test. --- .../perf_microbenchmark/CMakeLists.txt | 1 + .../tt_fabric_traffic_gen_rx_socket.cpp | 152 ++++ .../kernels/tt_fabric_traffic_gen_tx.cpp | 92 ++- .../tt_fabric_traffic_gen_tx_socket.cpp | 505 ++++++++++++++ .../test_tt_fabric_multi_hop_sanity.cpp | 66 +- .../routing/test_tt_fabric_socket_sanity.cpp | 648 ++++++++++++++++++ tt_fabric/hw/inc/routing_table.h | 1 + tt_fabric/hw/inc/tt_fabric.h | 629 +++++++++++++++-- tt_fabric/hw/inc/tt_fabric_api.h | 166 +++-- tt_fabric/hw/inc/tt_fabric_interface.h | 154 ++++- .../impl/kernels/tt_fabric_gatekeeper.cpp | 506 ++++++++++++++ tt_fabric/impl/kernels/tt_fabric_router.cpp | 63 +- 12 files changed, 2825 insertions(+), 158 deletions(-) create mode 100644 tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp create mode 100644 tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp create mode 100644 tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp create mode 100644 tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt index 972bc721995..31e3648d336 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt @@ -18,6 +18,7 @@ set(PERF_MICROBENCH_TESTS_SRCS routing/test_vc_loopback_tunnel.cpp routing/test_tt_fabric_sanity.cpp routing/test_tt_fabric_multi_hop_sanity.cpp + routing/test_tt_fabric_socket_sanity.cpp noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp old/matmul/matmul_global_l1.cpp old/matmul/matmul_local_l1.cpp diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp new file mode 100644 index 00000000000..ea42c57b436 --- /dev/null +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp @@ -0,0 +1,152 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +// clang-format off +#include "debug/dprint.h" +#include "dataflow_api.h" +#include "tt_fabric/hw/inc/tt_fabric.h" +#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp" +#include "tt_fabric/hw/inc/tt_fabric_interface.h" +#include "tt_fabric/hw/inc/tt_fabric_api.h" +// clang-format on + +// seed to re-generate the data and validate against incoming data +constexpr uint32_t prng_seed = get_compile_time_arg_val(0); + +// total data/payload expected +constexpr uint32_t total_data_kb = get_compile_time_arg_val(1); +constexpr uint64_t total_data_words = ((uint64_t)total_data_kb) * 1024 / PACKET_WORD_SIZE_BYTES; + +// max packet size to generate mask +constexpr uint32_t max_packet_size_words = get_compile_time_arg_val(2); +static_assert(max_packet_size_words > 3, "max_packet_size_words must be greater than 3"); + +// fabric command +constexpr uint32_t test_command = get_compile_time_arg_val(3); + +// address to start reading from/poll on +constexpr uint32_t target_address = get_compile_time_arg_val(4); + +// atomic increment for the ATOMIC_INC command +constexpr uint32_t atomic_increment = get_compile_time_arg_val(5); + +constexpr uint32_t test_results_addr_arg = get_compile_time_arg_val(6); +constexpr uint32_t test_results_size_bytes = get_compile_time_arg_val(7); +constexpr uint32_t gk_interface_addr_l = get_compile_time_arg_val(8); +constexpr uint32_t gk_interface_addr_h = get_compile_time_arg_val(9); +constexpr uint32_t client_interface_addr = get_compile_time_arg_val(10); +constexpr uint32_t client_pull_req_buf_addr = get_compile_time_arg_val(11); +constexpr uint32_t data_buffer_start_addr = get_compile_time_arg_val(12); +constexpr uint32_t data_buffer_size_words = get_compile_time_arg_val(13); + +volatile tt_l1_ptr chan_req_buf* client_pull_req_buf = + reinterpret_cast(client_pull_req_buf_addr); +volatile tt_fabric_client_interface_t* client_interface = (volatile tt_fabric_client_interface_t*)client_interface_addr; +uint64_t xy_local_addr; +socket_reader_state socket_reader; + +tt_l1_ptr uint32_t* const test_results = reinterpret_cast(test_results_addr_arg); + +#define PAYLOAD_MASK (0xFFFF0000) + +void kernel_main() { + uint64_t processed_packet_words = 0, num_packets = 0; + volatile tt_l1_ptr uint32_t* poll_addr; + uint32_t poll_val = 0; + bool async_wr_check_failed = false; + + // parse runtime args + uint32_t dest_device = get_arg_val(0); + + tt_fabric_init(); + + zero_l1_buf(test_results, test_results_size_bytes); + test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_STARTED; + test_results[PQ_TEST_MISC_INDEX] = 0xff000000; + zero_l1_buf( + reinterpret_cast(data_buffer_start_addr), data_buffer_size_words * PACKET_WORD_SIZE_BYTES); + test_results[PQ_TEST_MISC_INDEX] = 0xff000001; + zero_l1_buf((uint32_t*)client_interface, sizeof(tt_fabric_client_interface_t)); + test_results[PQ_TEST_MISC_INDEX] = 0xff000002; + zero_l1_buf((uint32_t*)client_pull_req_buf, sizeof(chan_req_buf)); + test_results[PQ_TEST_MISC_INDEX] = 0xff000003; + + client_interface->gk_interface_addr = ((uint64_t)gk_interface_addr_h << 32) | gk_interface_addr_l; + client_interface->gk_msg_buf_addr = client_interface->gk_interface_addr + offsetof(gatekeeper_info_t, gk_msg_buf); + client_interface->pull_req_buf_addr = xy_local_addr | client_pull_req_buf_addr; + test_results[PQ_TEST_MISC_INDEX] = 0xff000004; + + // make sure fabric node gatekeeper is available. 
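+    // Receive-side flow (summary of the code below): fabric_endpoint_init() confirms the
+    // gatekeeper is available, a datagram (SOCKET_TYPE_DGRAM) socket is then opened in the
+    // SOCKET_DIRECTION_RECV direction, and the main loop services FORWARD pull requests
+    // arriving in client_pull_req_buf by pulling the advertised data into the local buffer
+    // managed by socket_reader.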
+    fabric_endpoint_init();
+
+    socket_reader.init(data_buffer_start_addr, data_buffer_size_words);
+    DPRINT << "Socket open on " << dest_device << ENDL();
+    test_results[PQ_TEST_MISC_INDEX] = 0xff000005;
+
+    fabric_socket_open(
+        3,                      // the network plane to use for this socket
+        2,                      // Temporal epoch for which the socket is being opened
+        1,                      // Socket Id to open
+        SOCKET_TYPE_DGRAM,      // Unicast, Multicast, SSocket, DSocket
+        SOCKET_DIRECTION_RECV,  // Send or Receive
+        dest_device >> 16,      // Remote mesh/device that is the socket data sender/receiver.
+        dest_device & 0xFFFF,
+        0                       // fabric virtual channel.
+    );
+    test_results[PQ_TEST_MISC_INDEX] = 0xff000006;
+
+    uint32_t loop_count = 0;
+    uint32_t packet_count = 0;
+    while (1) {
+        if (!fvc_req_buf_is_empty(client_pull_req_buf) && fvc_req_valid(client_pull_req_buf)) {
+            uint32_t req_index = client_pull_req_buf->rdptr.ptr & CHAN_REQ_BUF_SIZE_MASK;
+            chan_request_entry_t* req = (chan_request_entry_t*)client_pull_req_buf->chan_req + req_index;
+            pull_request_t* pull_req = &req->pull_request;
+            if (socket_reader.packet_in_progress == 0) {
+                DPRINT << "Socket Packet " << packet_count << ENDL();
+            }
+            if (pull_req->flags == FORWARD) {
+                socket_reader.pull_socket_data(pull_req);
+                test_results[PQ_TEST_MISC_INDEX] = 0xDD000001;
+                noc_async_read_barrier();
+                update_pull_request_words_cleared(pull_req);
+                socket_reader.pull_words_in_flight = 0;
+                socket_reader.push_socket_data();
+            }
+
+            if (socket_reader.packet_in_progress == 1 and socket_reader.packet_words_remaining == 0) {
+                // wait for any pending socket data writes to finish.
+                test_results[PQ_TEST_MISC_INDEX] = 0xDD000002;
+
+                noc_async_write_barrier();
+
+                test_results[PQ_TEST_MISC_INDEX] = 0xDD000003;
+                // clear the flags field to invalidate pull request slot.
+                // flags will be set to non-zero by next requestor.
+ req_buf_advance_rdptr((chan_req_buf*)client_pull_req_buf); + socket_reader.packet_in_progress = 0; + packet_count++; + loop_count = 0; + } + } + test_results[PQ_TEST_MISC_INDEX] = 0xDD400000 | (loop_count & 0xfffff); + + loop_count++; + if (packet_count > 0 and loop_count >= 0x10000) { + DPRINT << "Socket Rx Finished" << packet_count << ENDL(); + break; + } + } + + // write out results + set_64b_result(test_results, processed_packet_words, PQ_TEST_WORD_CNT_INDEX); + set_64b_result(test_results, num_packets, TX_TEST_IDX_NPKT); + + if (async_wr_check_failed) { + test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_DATA_MISMATCH; + } else { + test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_PASS; + test_results[PQ_TEST_MISC_INDEX] = 0xff000005; + } +} diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp index be7520fd4b7..578a0f7861e 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp @@ -8,6 +8,7 @@ #include "tt_fabric/hw/inc/tt_fabric.h" #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp" #include "tt_fabric/hw/inc/tt_fabric_interface.h" +#include "tt_fabric/hw/inc/tt_fabric_api.h" // clang-format on uint32_t src_endpoint_id; @@ -20,8 +21,8 @@ constexpr uint32_t data_buffer_start_addr = get_compile_time_arg_val(3); constexpr uint32_t data_buffer_size_words = get_compile_time_arg_val(4); constexpr uint32_t routing_table_start_addr = get_compile_time_arg_val(5); -// constexpr uint32_t router_x = get_compile_time_arg_val(6); -// constexpr uint32_t router_y = get_compile_time_arg_val(7); +constexpr uint32_t gk_interface_addr_l = get_compile_time_arg_val(6); +constexpr uint32_t gk_interface_addr_h = get_compile_time_arg_val(7); constexpr uint32_t test_results_addr_arg = get_compile_time_arg_val(8); constexpr uint32_t test_results_size_bytes = get_compile_time_arg_val(9); @@ -55,6 +56,7 @@ constexpr uint32_t atomic_increment = get_compile_time_arg_val(20); uint32_t dest_device; constexpr uint32_t signal_address = get_compile_time_arg_val(21); +constexpr uint32_t client_interface_addr = get_compile_time_arg_val(22); uint32_t max_packet_size_mask; @@ -62,8 +64,10 @@ auto input_queue_state = select_input_queue(); volatile local_pull_request_t *local_pull_request = (volatile local_pull_request_t *)(data_buffer_start_addr - 1024); tt_l1_ptr volatile tt::tt_fabric::fabric_router_l1_config_t* routing_table = reinterpret_cast(routing_table_start_addr); +volatile tt_fabric_client_interface_t* client_interface = (volatile tt_fabric_client_interface_t*)client_interface_addr; fvc_producer_state_t test_producer __attribute__((aligned(16))); +fvcc_inbound_state_t fvcc_test_producer __attribute__((aligned(16))); uint64_t xy_local_addr; @@ -243,11 +247,82 @@ inline bool test_buffer_handler_atomic_inc() { return false; } +inline bool test_buffer_handler_fvcc() { + if (input_queue_state.all_packets_done()) { + return true; + } + + uint32_t free_words = fvcc_test_producer.get_num_msgs_free() * PACKET_HEADER_SIZE_WORDS; + if (free_words < PACKET_HEADER_SIZE_WORDS) { + return false; + } + + uint32_t byte_wr_addr = fvcc_test_producer.get_local_buffer_write_addr(); + uint32_t words_to_init = std::min(free_words, fvcc_test_producer.words_before_local_buffer_wrap()); + uint32_t 
words_initialized = 0; + while (words_initialized < words_to_init) { + if (input_queue_state.all_packets_done()) { + break; + } + + if (!input_queue_state.packet_active()) { // start of a new packet + input_queue_state.next_inline_packet(total_data_words); + + tt_l1_ptr uint32_t* header_ptr = reinterpret_cast(byte_wr_addr); + + packet_header.routing.flags = SYNC; + packet_header.routing.dst_mesh_id = dest_device >> 16; + packet_header.routing.dst_dev_id = dest_device & 0xFFFF; + packet_header.routing.src_dev_id = routing_table->my_device_id; + packet_header.routing.src_mesh_id = routing_table->my_mesh_id; + packet_header.routing.packet_size_bytes = PACKET_HEADER_SIZE_BYTES; + packet_header.session.command = ASYNC_WR_RESP; + packet_header.session.target_offset_l = target_address; + packet_header.session.target_offset_h = noc_offset; + packet_header.packet_parameters.misc_parameters.words[1] = 0; + packet_header.packet_parameters.misc_parameters.words[2] = 0; + tt_fabric_add_header_checksum(&packet_header); + uint32_t words_left = words_to_init - words_initialized; + bool split_header = words_left < PACKET_HEADER_SIZE_WORDS; + uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS; + if (split_header) { + header_words_to_init = words_left; + } + for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) { + header_ptr[i] = ((uint32_t*)&packet_header)[i]; + } + + words_initialized += header_words_to_init; + input_queue_state.curr_packet_words_remaining -= header_words_to_init; + byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES; + } else { + tt_l1_ptr uint32_t* header_ptr = reinterpret_cast(byte_wr_addr); + uint32_t header_words_initialized = + input_queue_state.curr_packet_size_words - input_queue_state.curr_packet_words_remaining; + uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS - header_words_initialized; + uint32_t header_dword_index = header_words_initialized * PACKET_WORD_SIZE_BYTES / 4; + uint32_t words_left = words_to_init - words_initialized; + header_words_to_init = std::min(words_left, header_words_to_init); + + for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) { + header_ptr[i] = ((uint32_t*)&packet_header)[i + header_dword_index]; + } + words_initialized += header_words_to_init; + input_queue_state.curr_packet_words_remaining -= header_words_to_init; + byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES; + } + } + fvcc_test_producer.advance_local_wrptr(words_initialized / PACKET_HEADER_SIZE_WORDS); + return false; +} + bool test_buffer_handler() { if constexpr (test_command == ASYNC_WR) { return test_buffer_handler_async_wr(); } else if constexpr (test_command == ATOMIC_INC) { return test_buffer_handler_atomic_inc(); + } else if constexpr (test_command == SOCKET_OPEN) { + return test_buffer_handler_fvcc(); } } @@ -284,6 +359,9 @@ void kernel_main() { zero_l1_buf(reinterpret_cast(data_buffer_start_addr), data_buffer_size_words * PACKET_WORD_SIZE_BYTES); zero_l1_buf((uint32_t*)local_pull_request, sizeof(local_pull_request_t)); zero_l1_buf((uint32_t*)&packet_header, sizeof(packet_header_t)); + zero_l1_buf((uint32_t*)client_interface, sizeof(tt_fabric_client_interface_t)); + client_interface->gk_msg_buf_addr = + (((uint64_t)gk_interface_addr_h << 32) | gk_interface_addr_l) + offsetof(gatekeeper_info_t, gk_msg_buf); if constexpr (pkt_dest_size_choice == pkt_dest_size_choices_t::RANDOM) { input_queue_state.init(src_endpoint_id, prng_seed); @@ -294,6 +372,7 @@ void kernel_main() { } 
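+    // Note: in addition to the bulk-data producer (test_producer), an FVCC producer
+    // (fvcc_test_producer) is initialized below. FVCC presumably refers to the fabric
+    // virtual control channel; test_buffer_handler_fvcc() above uses it to emit
+    // header-only control messages (SYNC-flagged ASYNC_WR_RESP packets).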
test_producer.init(data_buffer_start_addr, data_buffer_size_words, 0x0); + fvcc_test_producer.init(data_buffer_start_addr, 0x0, 0x0); uint32_t temp = max_packet_size_words; max_packet_size_mask = 0; @@ -364,6 +443,15 @@ void kernel_main() { DPRINT << "Packet Header Corrupted: packet " << packet_count << " Addr: " << test_producer.get_local_buffer_read_addr() << ENDL(); break; + } else if (fvcc_test_producer.get_curr_packet_valid()) { + fvcc_test_producer.fvcc_handler(); +#ifdef CHECK_TIMEOUT + progress_timestamp = get_timestamp_32b(); +#endif + } else if (fvcc_test_producer.packet_corrupted) { + DPRINT << "Packet Header Corrupted: packet " << packet_count + << " Addr: " << fvcc_test_producer.get_local_buffer_read_addr() << ENDL(); + break; } else if (all_packets_initialized) { DPRINT << "all packets done" << ENDL(); break; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp new file mode 100644 index 00000000000..dfd9a66f274 --- /dev/null +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp @@ -0,0 +1,505 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// clang-format off +#include "dataflow_api.h" +#include "debug/dprint.h" +#include "tt_fabric/hw/inc/tt_fabric.h" +#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp" +#include "tt_fabric/hw/inc/tt_fabric_interface.h" +#include "tt_fabric/hw/inc/tt_fabric_api.h" +// clang-format on + +uint32_t src_endpoint_id; +// constexpr uint32_t src_endpoint_id = get_compile_time_arg_val(0); +constexpr uint32_t num_dest_endpoints = get_compile_time_arg_val(1); +static_assert(is_power_of_2(num_dest_endpoints), "num_dest_endpoints must be a power of 2"); +constexpr uint32_t dest_endpoint_start_id = get_compile_time_arg_val(2); + +constexpr uint32_t data_buffer_start_addr = get_compile_time_arg_val(3); +constexpr uint32_t data_buffer_size_words = get_compile_time_arg_val(4); + +constexpr uint32_t routing_table_start_addr = get_compile_time_arg_val(5); +constexpr uint32_t gk_interface_addr_l = get_compile_time_arg_val(6); +constexpr uint32_t gk_interface_addr_h = get_compile_time_arg_val(7); + +constexpr uint32_t test_results_addr_arg = get_compile_time_arg_val(8); +constexpr uint32_t test_results_size_bytes = get_compile_time_arg_val(9); + +tt_l1_ptr uint32_t* const test_results = reinterpret_cast(test_results_addr_arg); + +constexpr uint32_t prng_seed = get_compile_time_arg_val(10); + +constexpr uint32_t total_data_kb = get_compile_time_arg_val(11); +constexpr uint64_t total_data_words = ((uint64_t)total_data_kb) * 1024 / PACKET_WORD_SIZE_BYTES; + +constexpr uint32_t max_packet_size_words = get_compile_time_arg_val(12); + +static_assert(max_packet_size_words > 3, "max_packet_size_words must be greater than 3"); + +constexpr uint32_t timeout_cycles = get_compile_time_arg_val(13); + +constexpr bool skip_pkt_content_gen = get_compile_time_arg_val(14); +constexpr pkt_dest_size_choices_t pkt_dest_size_choice = + static_cast(get_compile_time_arg_val(15)); + +constexpr uint32_t data_sent_per_iter_low = get_compile_time_arg_val(16); +constexpr uint32_t data_sent_per_iter_high = get_compile_time_arg_val(17); +constexpr uint32_t test_command = get_compile_time_arg_val(18); + +constexpr uint32_t base_target_address = get_compile_time_arg_val(19); +uint32_t 
target_address = base_target_address; + +// atomic increment for the ATOMIC_INC command +constexpr uint32_t atomic_increment = get_compile_time_arg_val(20); +// constexpr uint32_t dest_device = get_compile_time_arg_val(21); +uint32_t dest_device; + +constexpr uint32_t signal_address = get_compile_time_arg_val(21); +constexpr uint32_t client_interface_addr = get_compile_time_arg_val(22); +constexpr uint32_t client_pull_req_buf_addr = get_compile_time_arg_val(23); + +uint32_t max_packet_size_mask; + +auto input_queue_state = select_input_queue(); +volatile local_pull_request_t* local_pull_request = (volatile local_pull_request_t*)(data_buffer_start_addr - 1024); +volatile tt_l1_ptr tt::tt_fabric::fabric_router_l1_config_t* routing_table = + reinterpret_cast(routing_table_start_addr); +volatile tt_fabric_client_interface_t* client_interface = (volatile tt_fabric_client_interface_t*)client_interface_addr; +volatile tt_l1_ptr chan_req_buf* client_pull_req_buf = + reinterpret_cast(client_pull_req_buf_addr); + +fvc_producer_state_t test_producer __attribute__((aligned(16))); +fvcc_inbound_state_t fvcc_test_producer __attribute__((aligned(16))); + +uint64_t xy_local_addr; + +packet_header_t packet_header __attribute__((aligned(16))); + +uint32_t noc_offset; + +// generates packets with random size and payload on the input side +inline bool test_buffer_handler_dsocket_wr(socket_handle_t* socket_handle) { + if (input_queue_state.all_packets_done()) { + return true; + } + + uint32_t free_words = test_producer.get_num_words_free(); + if (free_words < PACKET_HEADER_SIZE_WORDS) { + return false; + } + + // Each call to test_buffer_handler initializes only up to the end + // of the producer buffer. Since the header is 3 words, we need to handle + // split header cases, where the buffer has not enough space for the full header. + // In this case, we write as many words as space available, and remaining header words + // are written on next call. 
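+    // Illustrative example (the header is PACKET_HEADER_SIZE_WORDS == 3 words): if only
+    // one word of space remains before the producer buffer wraps, this pass writes header
+    // word 0 there; the next call takes the else-branch below, detects the partially
+    // written header, and writes the remaining two header words at the wrapped write
+    // address before generating any payload words.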
+ uint32_t byte_wr_addr = test_producer.get_local_buffer_write_addr(); + uint32_t words_to_init = std::min(free_words, test_producer.words_before_local_buffer_wrap()); + uint32_t words_initialized = 0; + while (words_initialized < words_to_init) { + if (input_queue_state.all_packets_done()) { + break; + } + + if (!input_queue_state.packet_active()) { // start of a new packet + input_queue_state.next_packet( + num_dest_endpoints, + dest_endpoint_start_id, + max_packet_size_words, + max_packet_size_mask, + total_data_words); + + tt_l1_ptr uint32_t* header_ptr = reinterpret_cast(byte_wr_addr); + + packet_header.routing.flags = FORWARD; + packet_header.routing.packet_size_bytes = input_queue_state.curr_packet_size_words * PACKET_WORD_SIZE_BYTES; + packet_header.routing.dst_mesh_id = dest_device >> 16; + packet_header.routing.dst_dev_id = dest_device & 0xFFFF; + packet_header.session.command = DSOCKET_WR; + packet_header.session.target_offset_l = (uint32_t)socket_handle->pull_notification_adddr; + packet_header.session.target_offset_h = socket_handle->pull_notification_adddr >> 32; + target_address += packet_header.routing.packet_size_bytes - PACKET_HEADER_SIZE_BYTES; + packet_header.packet_parameters.misc_parameters.words[1] = input_queue_state.packet_rnd_seed; + tt_fabric_add_header_checksum(&packet_header); + uint32_t words_left = words_to_init - words_initialized; + bool split_header = words_left < PACKET_HEADER_SIZE_WORDS; + uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS; + if (split_header) { + header_words_to_init = words_left; + } + + for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) { + header_ptr[i] = ((uint32_t*)&packet_header)[i]; + } + + words_initialized += header_words_to_init; + input_queue_state.curr_packet_words_remaining -= header_words_to_init; + byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES; + } else { + uint32_t words_remaining = words_to_init - words_initialized; + uint32_t packet_words_initialized = + input_queue_state.curr_packet_size_words - input_queue_state.curr_packet_words_remaining; + if (packet_words_initialized < PACKET_HEADER_SIZE_WORDS) { + tt_l1_ptr uint32_t* header_ptr = reinterpret_cast(byte_wr_addr); + uint32_t header_words_initialized = packet_words_initialized; + uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS - header_words_initialized; + uint32_t header_dword_index = header_words_initialized * PACKET_WORD_SIZE_BYTES / 4; + header_words_to_init = std::min(words_remaining, header_words_to_init); + for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) { + header_ptr[i] = ((uint32_t*)&packet_header)[i + header_dword_index]; + } + words_initialized += header_words_to_init; + words_remaining = words_to_init - words_initialized; + input_queue_state.curr_packet_words_remaining -= header_words_to_init; + byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES; + if (words_remaining == 0) { + // no space left for packet data. 
+ break; + } + } + + uint32_t num_words = std::min(words_remaining, input_queue_state.curr_packet_words_remaining); + if constexpr (!skip_pkt_content_gen) { + uint32_t start_val = (input_queue_state.packet_rnd_seed & 0xFFFF0000) + + (input_queue_state.curr_packet_size_words - + input_queue_state.curr_packet_words_remaining - PACKET_HEADER_SIZE_WORDS); + fill_packet_data(reinterpret_cast(byte_wr_addr), num_words, start_val); + } + words_initialized += num_words; + input_queue_state.curr_packet_words_remaining -= num_words; + byte_wr_addr += num_words * PACKET_WORD_SIZE_BYTES; + } + } + test_producer.advance_local_wrptr(words_initialized); + return false; +} + +// generates packets with random size and payload on the input side +inline bool test_buffer_handler_atomic_inc() { + if (input_queue_state.all_packets_done()) { + return true; + } + + uint32_t free_words = test_producer.get_num_words_free(); + if (free_words < PACKET_HEADER_SIZE_WORDS) { + return false; + } + + uint32_t byte_wr_addr = test_producer.get_local_buffer_write_addr(); + uint32_t words_to_init = std::min(free_words, test_producer.words_before_local_buffer_wrap()); + uint32_t words_initialized = 0; + while (words_initialized < words_to_init) { + if (input_queue_state.all_packets_done()) { + break; + } + + if (!input_queue_state.packet_active()) { // start of a new packet + input_queue_state.next_inline_packet(total_data_words); + + tt_l1_ptr uint32_t* header_ptr = reinterpret_cast(byte_wr_addr); + + packet_header.routing.flags = INLINE_FORWARD; + packet_header.routing.dst_mesh_id = dest_device >> 16; + packet_header.routing.dst_dev_id = dest_device & 0xFFFF; + packet_header.routing.packet_size_bytes = PACKET_HEADER_SIZE_BYTES; + packet_header.session.command = ATOMIC_INC; + packet_header.session.target_offset_l = target_address; + packet_header.session.target_offset_h = noc_offset; + packet_header.packet_parameters.atomic_parameters.wrap_boundary = 31; + packet_header.packet_parameters.atomic_parameters.increment = atomic_increment; + tt_fabric_add_header_checksum(&packet_header); + uint32_t words_left = words_to_init - words_initialized; + bool split_header = words_left < PACKET_HEADER_SIZE_WORDS; + uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS; + if (split_header) { + header_words_to_init = words_left; + } + for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) { + header_ptr[i] = ((uint32_t*)&packet_header)[i]; + } + + words_initialized += header_words_to_init; + input_queue_state.curr_packet_words_remaining -= header_words_to_init; + byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES; + } else { + tt_l1_ptr uint32_t* header_ptr = reinterpret_cast(byte_wr_addr); + uint32_t header_words_initialized = + input_queue_state.curr_packet_size_words - input_queue_state.curr_packet_words_remaining; + uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS - header_words_initialized; + uint32_t header_dword_index = header_words_initialized * PACKET_WORD_SIZE_BYTES / 4; + uint32_t words_left = words_to_init - words_initialized; + header_words_to_init = std::min(words_left, header_words_to_init); + + for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) { + header_ptr[i] = ((uint32_t*)&packet_header)[i + header_dword_index]; + } + words_initialized += header_words_to_init; + input_queue_state.curr_packet_words_remaining -= header_words_to_init; + byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES; + } + } + 
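+    // Make the header words initialized in this pass visible to the consumer side by
+    // advancing the producer's local write pointer.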
test_producer.advance_local_wrptr(words_initialized); + return false; +} + +inline bool test_buffer_handler_fvcc() { + if (input_queue_state.all_packets_done()) { + return true; + } + + uint32_t free_words = fvcc_test_producer.get_num_msgs_free() * PACKET_HEADER_SIZE_WORDS; + if (free_words < PACKET_HEADER_SIZE_WORDS) { + return false; + } + + uint32_t byte_wr_addr = fvcc_test_producer.get_local_buffer_write_addr(); + uint32_t words_to_init = std::min(free_words, fvcc_test_producer.words_before_local_buffer_wrap()); + uint32_t words_initialized = 0; + while (words_initialized < words_to_init) { + if (input_queue_state.all_packets_done()) { + break; + } + + if (!input_queue_state.packet_active()) { // start of a new packet + input_queue_state.next_inline_packet(total_data_words); + + tt_l1_ptr uint32_t* header_ptr = reinterpret_cast(byte_wr_addr); + + packet_header.routing.flags = SYNC; + packet_header.routing.dst_mesh_id = dest_device >> 16; + packet_header.routing.dst_dev_id = dest_device & 0xFFFF; + packet_header.routing.src_dev_id = routing_table->my_device_id; + packet_header.routing.src_mesh_id = routing_table->my_mesh_id; + packet_header.routing.packet_size_bytes = PACKET_HEADER_SIZE_BYTES; + packet_header.session.command = ASYNC_WR_RESP; + packet_header.session.target_offset_l = target_address; + packet_header.session.target_offset_h = noc_offset; + packet_header.packet_parameters.misc_parameters.words[1] = 0; + packet_header.packet_parameters.misc_parameters.words[2] = 0; + + tt_fabric_add_header_checksum(&packet_header); + uint32_t words_left = words_to_init - words_initialized; + bool split_header = words_left < PACKET_HEADER_SIZE_WORDS; + uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS; + if (split_header) { + header_words_to_init = words_left; + } + for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) { + header_ptr[i] = ((uint32_t*)&packet_header)[i]; + } + + words_initialized += header_words_to_init; + input_queue_state.curr_packet_words_remaining -= header_words_to_init; + byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES; + } else { + tt_l1_ptr uint32_t* header_ptr = reinterpret_cast(byte_wr_addr); + uint32_t header_words_initialized = + input_queue_state.curr_packet_size_words - input_queue_state.curr_packet_words_remaining; + uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS - header_words_initialized; + uint32_t header_dword_index = header_words_initialized * PACKET_WORD_SIZE_BYTES / 4; + uint32_t words_left = words_to_init - words_initialized; + header_words_to_init = std::min(words_left, header_words_to_init); + + for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) { + header_ptr[i] = ((uint32_t*)&packet_header)[i + header_dword_index]; + } + words_initialized += header_words_to_init; + input_queue_state.curr_packet_words_remaining -= header_words_to_init; + byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES; + } + } + fvcc_test_producer.advance_local_wrptr(words_initialized / PACKET_HEADER_SIZE_WORDS); + return false; +} + +bool test_buffer_handler(socket_handle_t* socket_handle) { + if constexpr (test_command == DSOCKET_WR) { + return test_buffer_handler_dsocket_wr(socket_handle); + } else if constexpr (test_command == ATOMIC_INC) { + return test_buffer_handler_atomic_inc(); + } else if constexpr (test_command == SOCKET_OPEN) { + return test_buffer_handler_fvcc(); + } + return true; +} + +void kernel_main() { + tt_fabric_init(); + + // TODO: refactor + src_endpoint_id 
= get_arg_val(0); + noc_offset = get_arg_val(1); + uint32_t router_x = get_arg_val(2); + uint32_t router_y = get_arg_val(3); + dest_device = get_arg_val(4); + + if (ASYNC_WR == test_command) { + target_address = get_arg_val(5); + } + + uint64_t router_config_addr = NOC_XY_ADDR(router_x, router_y, eth_l1_mem::address_map::FABRIC_ROUTER_CONFIG_BASE); + noc_async_read_one_packet( + router_config_addr, routing_table_start_addr, sizeof(tt::tt_fabric::fabric_router_l1_config_t)); + noc_async_read_barrier(); + + zero_l1_buf(test_results, test_results_size_bytes); + test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_STARTED; + test_results[PQ_TEST_STATUS_INDEX + 1] = (uint32_t)local_pull_request; + + test_results[PQ_TEST_MISC_INDEX] = 0xff000000; + test_results[PQ_TEST_MISC_INDEX + 1] = 0xcc000000 | src_endpoint_id; + + zero_l1_buf( + reinterpret_cast(data_buffer_start_addr), data_buffer_size_words * PACKET_WORD_SIZE_BYTES); + zero_l1_buf((uint32_t*)local_pull_request, sizeof(local_pull_request_t)); + zero_l1_buf((uint32_t*)&packet_header, sizeof(packet_header_t)); + zero_l1_buf((uint32_t*)client_interface, sizeof(tt_fabric_client_interface_t)); + zero_l1_buf((uint32_t*)client_pull_req_buf, sizeof(chan_req_buf)); + client_interface->gk_interface_addr = ((uint64_t)gk_interface_addr_h << 32) | gk_interface_addr_l; + client_interface->gk_msg_buf_addr = client_interface->gk_interface_addr + offsetof(gatekeeper_info_t, gk_msg_buf); + client_interface->pull_req_buf_addr = xy_local_addr | client_pull_req_buf_addr; + + // make sure fabric node gatekeeper is available. + fabric_endpoint_init(); + + if constexpr (pkt_dest_size_choice == pkt_dest_size_choices_t::RANDOM) { + input_queue_state.init(src_endpoint_id, prng_seed); + } else if constexpr (pkt_dest_size_choice == pkt_dest_size_choices_t::SAME_START_RNDROBIN_FIX_SIZE) { + input_queue_state.init(max_packet_size_words, 0); + } else { + input_queue_state.init(src_endpoint_id, prng_seed); + } + + test_producer.init(data_buffer_start_addr, data_buffer_size_words, 0x0); + fvcc_test_producer.init(data_buffer_start_addr, 0x0, 0x0); + + uint32_t temp = max_packet_size_words; + max_packet_size_mask = 0; + temp >>= 1; + while (temp) { + max_packet_size_mask = (max_packet_size_mask << 1) + 1; + temp >>= 1; + } + if ((max_packet_size_mask + 1) != max_packet_size_words) { + // max_packet_size_words is not a power of 2 + // snap to next power of 2 mask + max_packet_size_mask = (max_packet_size_mask << 1) + 1; + } + + // wait till test sends start signal. This is set by test + // once tt_fabric kernels have been launched on all the test devices. + while (*(volatile tt_l1_ptr uint32_t*)signal_address == 0); + + test_results[PQ_TEST_MISC_INDEX] = 0xff000001; + + uint64_t data_words_sent = 0; + uint64_t iter = 0; + uint64_t zero_data_sent_iter = 0; + uint64_t few_data_sent_iter = 0; + uint64_t many_data_sent_iter = 0; + uint64_t words_flushed = 0; + bool timeout = false; + uint64_t start_timestamp = get_timestamp(); + uint32_t progress_timestamp = start_timestamp & 0xFFFFFFFF; + + uint32_t curr_packet_size = 0; + uint32_t curr_packet_words_sent = 0; + uint32_t packet_count = 0; + + socket_handle_t* socket_handle = fabric_socket_open( + 3, // the network plane to use for this socket + 2, // Temporal epoch for which the socket is being opened + 1, // Socket Id to open + SOCKET_TYPE_DGRAM, // Unicast, Multicast, SSocket, DSocket + SOCKET_DIRECTION_SEND, // Send or Receive + dest_device >> 16, // Remote mesh/device that is the socket data sender/receiver. 
+ dest_device & 0xFFFF, + 0 // fabric virtual channel. + ); + + fabric_socket_connect(socket_handle); + DPRINT << "Socket 1 connected. Handle = " << (uint32_t)socket_handle << ENDL(); + + while (true) { + iter++; +#ifdef CHECK_TIMEOUT + if (timeout_cycles > 0) { + uint32_t cycles_since_progress = get_timestamp_32b() - progress_timestamp; + if (cycles_since_progress > timeout_cycles) { + timeout = true; + break; + } + } +#endif + + bool all_packets_initialized = test_buffer_handler(socket_handle); + + if (test_producer.get_curr_packet_valid()) { + curr_packet_size = + (test_producer.current_packet_header.routing.packet_size_bytes + PACKET_WORD_SIZE_BYTES - 1) >> 4; + uint32_t curr_data_words_sent = test_producer.pull_data_from_fvc_buffer(); + curr_packet_words_sent += curr_data_words_sent; + data_words_sent += curr_data_words_sent; + if constexpr (!(data_sent_per_iter_low == 0 && data_sent_per_iter_high == 0)) { + zero_data_sent_iter += static_cast(curr_data_words_sent <= 0); + few_data_sent_iter += static_cast(curr_data_words_sent <= data_sent_per_iter_low); + many_data_sent_iter += static_cast(curr_data_words_sent >= data_sent_per_iter_high); + } +#ifdef CHECK_TIMEOUT + progress_timestamp = (curr_data_words_sent > 0) ? get_timestamp_32b() : progress_timestamp; +#endif + if (curr_packet_words_sent == curr_packet_size) { + curr_packet_words_sent = 0; + packet_count++; + } + } else if (test_producer.packet_corrupted) { + DPRINT << "Packet Header Corrupted: packet " << packet_count + << " Addr: " << test_producer.get_local_buffer_read_addr() << ENDL(); + break; + } else if (fvcc_test_producer.get_curr_packet_valid()) { + fvcc_test_producer.fvcc_handler(); +#ifdef CHECK_TIMEOUT + progress_timestamp = get_timestamp_32b(); +#endif + } else if (fvcc_test_producer.packet_corrupted) { + DPRINT << "Packet Header Corrupted: packet " << packet_count + << " Addr: " << fvcc_test_producer.get_local_buffer_read_addr() << ENDL(); + break; + } else if (all_packets_initialized) { + DPRINT << "all packets done" << ENDL(); + break; + } + } + + /* + fabric_socket_open(3, 4, 2, SOCKET_TYPE_DGRAM, SOCKET_DIRECTION_SEND, dest_device >> 16, dest_device & 0xFFFF, + 0); fabric_socket_open(3, 6, 3, SOCKET_TYPE_DGRAM, SOCKET_DIRECTION_SEND, dest_device >> 16, dest_device & + 0xFFFF, 0); fabric_socket_open(3, 8, 4, SOCKET_TYPE_DGRAM, SOCKET_DIRECTION_SEND, dest_device >> 16, dest_device + & 0xFFFF, 0); fabric_socket_open(3, 10, 5, SOCKET_TYPE_DGRAM, SOCKET_DIRECTION_SEND, dest_device >> 16, + dest_device & 0xFFFF, 0); + */ + uint64_t cycles_elapsed = get_timestamp() - start_timestamp; + + uint64_t num_packets = input_queue_state.get_num_packets(); + set_64b_result(test_results, data_words_sent, PQ_TEST_WORD_CNT_INDEX); + set_64b_result(test_results, cycles_elapsed, PQ_TEST_CYCLES_INDEX); + set_64b_result(test_results, iter, PQ_TEST_ITER_INDEX); + set_64b_result(test_results, total_data_words, TX_TEST_IDX_TOT_DATA_WORDS); + set_64b_result(test_results, num_packets, TX_TEST_IDX_NPKT); + set_64b_result(test_results, zero_data_sent_iter, TX_TEST_IDX_ZERO_DATA_WORDS_SENT_ITER); + set_64b_result(test_results, few_data_sent_iter, TX_TEST_IDX_FEW_DATA_WORDS_SENT_ITER); + set_64b_result(test_results, many_data_sent_iter, TX_TEST_IDX_MANY_DATA_WORDS_SENT_ITER); + + if (test_producer.packet_corrupted) { + test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_BAD_HEADER; + test_results[PQ_TEST_MISC_INDEX] = packet_count; + } else if (!timeout) { + test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_PASS; + 
test_results[PQ_TEST_MISC_INDEX] = packet_count; + } else { + test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_TIMEOUT; + set_64b_result(test_results, words_flushed, TX_TEST_IDX_WORDS_FLUSHED); + } +} diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp index 352a3d107eb..bb096660eca 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp @@ -23,6 +23,8 @@ int main(int argc, char** argv) { constexpr uint32_t default_tx_y = 0; constexpr uint32_t default_rx_x = 0; constexpr uint32_t default_rx_y = 3; + constexpr uint32_t default_gk_x = 0; + constexpr uint32_t default_gk_y = 9; constexpr uint32_t default_mux_x = 0; constexpr uint32_t default_mux_y = 1; @@ -155,6 +157,8 @@ int main(int argc, char** argv) { uint32_t tx_y = test_args::get_command_option_uint32(input_args, "--tx_y", default_tx_y); uint32_t rx_x = test_args::get_command_option_uint32(input_args, "--rx_x", default_rx_x); uint32_t rx_y = test_args::get_command_option_uint32(input_args, "--rx_y", default_rx_y); + uint32_t gk_x = test_args::get_command_option_uint32(input_args, "--gk_x", default_gk_x); + uint32_t gk_y = test_args::get_command_option_uint32(input_args, "--gk_y", default_gk_y); uint32_t prng_seed = test_args::get_command_option_uint32(input_args, "--prng_seed", default_prng_seed); uint32_t data_kb_per_tx = test_args::get_command_option_uint32(input_args, "--data_kb_per_tx", default_data_kb_per_tx); @@ -219,6 +223,8 @@ int main(int argc, char** argv) { bool pass = true; + CoreCoord gk_core = {gk_x, gk_y}; + std::map defines = { {"FD_CORE_TYPE", std::to_string(0)}, // todo, support dispatch on eth }; @@ -289,6 +295,16 @@ int main(int argc, char** argv) { bool router_core_found = false; CoreCoord router_logical_core; CoreCoord router_phys_core; + CoreCoord gk_phys_core; + uint32_t routing_table_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); + uint32_t gk_interface_addr = routing_table_addr + sizeof(tt::tt_fabric::fabric_router_l1_config_t) * 4; + uint32_t client_interface_addr = routing_table_addr + sizeof(tt::tt_fabric::fabric_router_l1_config_t) * 4; + ; + uint32_t socket_info_addr = gk_interface_addr + sizeof(gatekeeper_info_t); + log_info(LogTest, "GK Routing Table Addr = 0x{:08X}", routing_table_addr); + log_info(LogTest, "GK Info Addr = 0x{:08X}", gk_interface_addr); + log_info(LogTest, "GK Socket Info Addr = 0x{:08X}", socket_info_addr); + for (auto device : device_map) { auto neighbors = tt::Cluster::instance().get_ethernet_connected_device_ids(device.second->id()); std::vector device_router_cores; @@ -326,12 +342,15 @@ int main(int argc, char** argv) { sem_count, // 0: number of active fabric routers router_mask, // 1: active fabric router mask }; + gk_phys_core = (device.second->worker_core_from_logical_core(gk_core)); for (auto logical_core : device_router_cores) { std::vector router_compile_args = { - (tunneler_queue_size_bytes >> 4), // 0: rx_queue_size_words - tunneler_test_results_addr, // 1: test_results_addr - tunneler_test_results_size, // 2: test_results_size - 0, // 3: timeout_cycles + (tunneler_queue_size_bytes >> 4), // 0: rx_queue_size_words + gk_interface_addr, // 1: gk_message_addr_l + (gk_phys_core.y << 10) | (gk_phys_core.x << 4), // 2: gk_message_addr_h + 
tunneler_test_results_addr, // 3: test_results_addr + tunneler_test_results_size, // 4: test_results_size + 0, // 5: timeout_cycles }; auto router_kernel = tt_metal::CreateKernel( program_map[device.first], @@ -348,6 +367,28 @@ int main(int argc, char** argv) { device.first, device.second->ethernet_core_from_logical_core(logical_core)); } + // setup runtime args + log_info(LogTest, "run tt_fabric gatekeeper at x={},y={}", gk_core.x, gk_core.y); + std::vector gk_compile_args = { + gk_interface_addr, // 0: + socket_info_addr, // 1: + routing_table_addr, // 2 + test_results_addr, // 3: test_results_addr + test_results_size, // 4: test_results_size + 0, // 5: timeout_cycles + }; + + auto kernel = tt_metal::CreateKernel( + program_map[device.first], + "tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp", + {gk_core}, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, + .noc = tt_metal::NOC::RISCV_0_default, + .compile_args = gk_compile_args, + .defines = defines}); + + tt_metal::SetRuntimeArgs(program_map[device.first], kernel, gk_core, runtime_args); } if (check_txrx_timeout) { @@ -358,6 +399,7 @@ int main(int argc, char** argv) { for (uint32_t i = 0; i < num_src_endpoints; i++) { CoreCoord core = {tx_x + i, tx_y}; tx_phys_core.push_back(device_map[test_device_id_l]->worker_core_from_logical_core(core)); + CoreCoord tx_gk_phys_core = device_map[test_device_id_l]->worker_core_from_logical_core(gk_core); std::vector compile_args = { (device_map[test_device_id_l]->id() << 8) + src_endpoint_start_id + i, // 0: src_endpoint_id num_dest_endpoints, // 1: num_dest_endpoints @@ -365,8 +407,8 @@ int main(int argc, char** argv) { tx_queue_start_addr, // 3: queue_start_addr_words (tx_queue_size_bytes >> 4), // 4: queue_size_words routing_table_start_addr, // 5: routeing table - 0, // 6: router_x - 0, // 7: router_y + gk_interface_addr, // 6: gk_message_addr_l + (tx_gk_phys_core.y << 10) | (tx_gk_phys_core.x << 4), // 7: gk_message_addr_h test_results_addr, // 8: test_results_addr test_results_size, // 9: test_results_size prng_seed, // 10: prng_seed @@ -380,7 +422,8 @@ int main(int argc, char** argv) { fabric_command, // 18: fabric command target_address, atomic_increment, - tx_signal_address + tx_signal_address, + client_interface_addr, }; @@ -421,10 +464,12 @@ int main(int argc, char** argv) { // Initialize tt_fabric router sync semaphore to 0. Routers on each device // increment once handshake with ethernet peer has been completed. std::vector zero_buf(1, 0); + std::vector gk_zero_buf(12, 0); for (auto [device_id, router_phys_cores] : device_router_map) { for (auto phys_core : router_phys_cores) { tt::llrt::write_hex_vec_to_core(device_id, phys_core, zero_buf, FABRIC_ROUTER_SYNC_SEM); } + tt::llrt::write_hex_vec_to_core(device_id, gk_phys_core, gk_zero_buf, gk_interface_addr); } for (uint32_t i = 0; i < num_src_endpoints; i++) { @@ -456,11 +501,10 @@ int main(int argc, char** argv) { log_info(LogTest, "Tx Finished"); - // terminate fabric routers + // terminate gatekeeper. + // Gatekeeper will signal all routers on the device to terminate. for (auto [device_id, router_phys_cores] : device_router_map) { - for (auto phys_core : router_phys_cores) { - tt::llrt::write_hex_vec_to_core(device_id, phys_core, zero_buf, FABRIC_ROUTER_SYNC_SEM); - } + tt::llrt::write_hex_vec_to_core(device_id, gk_phys_core, zero_buf, gk_interface_addr); } // wait for all kernels to finish. 
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp new file mode 100644 index 00000000000..46309dc79c5 --- /dev/null +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp @@ -0,0 +1,648 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tt_metal/host_api.hpp" +#include "tt_metal/detail/tt_metal.hpp" +#include "tt_metal/impl/device/device.hpp" +#include "tt_metal/llrt/rtoptions.hpp" +#include "tt_fabric/control_plane.hpp" +// #include "tt_metal/impl/dispatch/cq_commands.hpp" +// #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" +#include "kernels/tt_fabric_traffic_gen_test.hpp" +#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp" +#include "tt_metal/hw/inc/wormhole/eth_l1_address_map.h" +#include "tt_fabric/hw/inc/tt_fabric_interface.h" + +using std::vector; +using namespace tt; +using json = nlohmann::json; + +int main(int argc, char** argv) { + constexpr uint32_t default_tx_x = 0; + constexpr uint32_t default_tx_y = 0; + constexpr uint32_t default_rx_x = 0; + constexpr uint32_t default_rx_y = 3; + constexpr uint32_t default_gk_x = 0; + constexpr uint32_t default_gk_y = 9; + + constexpr uint32_t default_mux_x = 0; + constexpr uint32_t default_mux_y = 1; + constexpr uint32_t default_demux_x = 0; + constexpr uint32_t default_demux_y = 2; + + constexpr uint32_t default_prng_seed = 0x100; + constexpr uint32_t default_data_kb_per_tx = 1024 * 1024; + constexpr uint32_t default_max_packet_size_words = 0x100; + + constexpr uint32_t default_routing_table_start_addr = 0x7EC00; + constexpr uint32_t default_tx_queue_start_addr = 0x80000; + constexpr uint32_t default_tx_queue_size_bytes = 0x10000; + constexpr uint32_t default_rx_queue_start_addr = 0x80000; + constexpr uint32_t default_rx_queue_size_bytes = 0x10000; + constexpr uint32_t default_tx_signal_address = 0x70000; + + constexpr uint32_t default_test_results_addr = 0x100000; + constexpr uint32_t default_test_results_size = 0x40000; + + // TODO: use eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE which should be 0x19900, set test results size back + // to 0x7000 + constexpr uint32_t default_tunneler_queue_size_bytes = + 0x4000; // max fvc size (send + receive. 
top half is send buffer, bottom half is receive buffer) + constexpr uint32_t default_tunneler_test_results_addr = + 0x39000; // 0x8000 * 4 + 0x19000; 0x10000 * 4 + 0x19000 = 0x59000 > 0x40000 (256kB) + constexpr uint32_t default_tunneler_test_results_size = 0x6000; // 256kB total L1 in ethernet core - 0x39000 + + constexpr uint32_t default_timeout_mcycles = 1000; + constexpr uint32_t default_rx_disable_data_check = 0; + constexpr uint32_t default_rx_disable_header_check = 0; + constexpr uint32_t default_tx_skip_pkt_content_gen = 0; + constexpr uint32_t default_check_txrx_timeout = 1; + + constexpr uint32_t src_endpoint_start_id = 4; + constexpr uint32_t dest_endpoint_start_id = 11; + + constexpr uint32_t num_endpoints = 1; + constexpr uint32_t num_src_endpoints = num_endpoints; + constexpr uint32_t num_dest_endpoints = num_endpoints; + + constexpr uint8_t default_tx_pkt_dest_size_choice = 0; // pkt_dest_size_choices_t + + constexpr uint32_t default_tx_data_sent_per_iter_low = 20; + constexpr uint32_t default_tx_data_sent_per_iter_high = 240; + + constexpr uint32_t default_fabric_command = 1; + + constexpr uint32_t default_dump_stat_json = 0; + constexpr const char* default_output_dir = "/tmp"; + + constexpr uint32_t default_test_device_id_l = 0; + constexpr uint32_t default_test_device_id_r = -1; + + constexpr uint32_t default_target_address = 0x30000; + + constexpr uint32_t default_atomic_increment = 4; + + std::vector input_args(argv, argv + argc); + if (test_args::has_command_option(input_args, "-h") || test_args::has_command_option(input_args, "--help")) { + log_info(LogTest, "Usage:"); + log_info(LogTest, " --prng_seed: PRNG seed, default = 0x{:x}", default_prng_seed); + log_info(LogTest, " --data_kb_per_tx: Total data in KB per TX endpoint, default = {}", default_data_kb_per_tx); + log_info( + LogTest, + " --max_packet_size_words: Max packet size in words, default = 0x{:x}", + default_max_packet_size_words); + log_info(LogTest, " --tx_x: X coordinate of the starting TX core, default = {}", default_tx_x); + log_info(LogTest, " --tx_y: Y coordinate of the starting TX core, default = {}", default_tx_y); + log_info(LogTest, " --rx_x: X coordinate of the starting RX core, default = {}", default_rx_x); + log_info(LogTest, " --rx_y: Y coordinate of the starting RX core, default = {}", default_rx_y); + log_info( + LogTest, + " --routing_table_start_addr: Routing Table start address, default = 0x{:x}", + default_routing_table_start_addr); + log_info( + LogTest, " --tx_queue_start_addr: TX queue start address, default = 0x{:x}", default_tx_queue_start_addr); + log_info( + LogTest, " --tx_queue_size_bytes: TX queue size in bytes, default = 0x{:x}", default_tx_queue_size_bytes); + log_info( + LogTest, " --rx_queue_start_addr: RX queue start address, default = 0x{:x}", default_rx_queue_start_addr); + log_info( + LogTest, " --rx_queue_size_bytes: RX queue size in bytes, default = 0x{:x}", default_rx_queue_size_bytes); + log_info( + LogTest, " --test_results_addr: test results buf address, default = 0x{:x}", default_test_results_addr); + log_info(LogTest, " --test_results_size: test results buf size, default = 0x{:x}", default_test_results_size); + log_info(LogTest, " --timeout_mcycles: Timeout in MCycles, default = {}", default_timeout_mcycles); + log_info( + LogTest, + " --check_txrx_timeout: Check if timeout happens during tx & rx (if enabled, timeout_mcycles will also be " + "used), default = {}", + default_check_txrx_timeout); + log_info( + LogTest, + " --rx_disable_data_check: Disable data 
check on RX, default = {}", + default_rx_disable_data_check); + log_info( + LogTest, + " --rx_disable_header_check: Disable header check on RX, default = {}", + default_rx_disable_header_check); + log_info( + LogTest, + " --tx_skip_pkt_content_gen: Skip packet content generation during tx, default = {}", + default_tx_skip_pkt_content_gen); + log_info( + LogTest, + " --tx_pkt_dest_size_choice: choice for how packet destination and packet size are generated, default = " + "{}", + default_tx_pkt_dest_size_choice); // pkt_dest_size_choices_t + log_info( + LogTest, + " --tx_data_sent_per_iter_low: the criteria to determine the amount of tx data sent per iter is low " + "(unit: words); if both 0, then disable counting it in tx kernel, default = {}", + default_tx_data_sent_per_iter_low); + log_info( + LogTest, + " --tx_data_sent_per_iter_high: the criteria to determine the amount of tx data sent per iter is high " + "(unit: words); if both 0, then disable counting it in tx kernel, default = {}", + default_tx_data_sent_per_iter_high); + log_info(LogTest, " --dump_stat_json: Dump stats in json to output_dir, default = {}", default_dump_stat_json); + log_info(LogTest, " --output_dir: Output directory, default = {}", default_output_dir); + log_info( + LogTest, " --device_id: Device on which the test will be run, default = {}", default_test_device_id_l); + log_info( + LogTest, " --device_id_r: Device on which the test will be run, default = {}", default_test_device_id_r); + return 0; + } + + uint32_t tx_x = test_args::get_command_option_uint32(input_args, "--tx_x", default_tx_x); + uint32_t tx_y = test_args::get_command_option_uint32(input_args, "--tx_y", default_tx_y); + uint32_t rx_x = test_args::get_command_option_uint32(input_args, "--rx_x", default_rx_x); + uint32_t rx_y = test_args::get_command_option_uint32(input_args, "--rx_y", default_rx_y); + uint32_t gk_x = test_args::get_command_option_uint32(input_args, "--gk_x", default_gk_x); + uint32_t gk_y = test_args::get_command_option_uint32(input_args, "--gk_y", default_gk_y); + uint32_t prng_seed = test_args::get_command_option_uint32(input_args, "--prng_seed", default_prng_seed); + uint32_t data_kb_per_tx = + test_args::get_command_option_uint32(input_args, "--data_kb_per_tx", default_data_kb_per_tx); + uint32_t max_packet_size_words = + test_args::get_command_option_uint32(input_args, "--max_packet_size_words", default_max_packet_size_words); + uint32_t routing_table_start_addr = test_args::get_command_option_uint32( + input_args, "--routing_table_start_addr", default_routing_table_start_addr); + uint32_t tx_queue_start_addr = + test_args::get_command_option_uint32(input_args, "--tx_queue_start_addr", default_tx_queue_start_addr); + uint32_t tx_queue_size_bytes = + test_args::get_command_option_uint32(input_args, "--tx_queue_size_bytes", default_tx_queue_size_bytes); + uint32_t rx_queue_start_addr = + test_args::get_command_option_uint32(input_args, "--rx_queue_start_addr", default_rx_queue_start_addr); + uint32_t rx_queue_size_bytes = + test_args::get_command_option_uint32(input_args, "--rx_queue_size_bytes", default_rx_queue_size_bytes); + uint32_t tunneler_queue_size_bytes = test_args::get_command_option_uint32( + input_args, "--tunneler_queue_size_bytes", default_tunneler_queue_size_bytes); + uint32_t test_results_addr = + test_args::get_command_option_uint32(input_args, "--test_results_addr", default_test_results_addr); + uint32_t test_results_size = + test_args::get_command_option_uint32(input_args, "--test_results_size", 
default_test_results_size); + uint32_t tunneler_test_results_addr = test_args::get_command_option_uint32( + input_args, "--tunneler_test_results_addr", default_tunneler_test_results_addr); + uint32_t tunneler_test_results_size = test_args::get_command_option_uint32( + input_args, "--tunneler_test_results_size", default_tunneler_test_results_size); + uint32_t timeout_mcycles = + test_args::get_command_option_uint32(input_args, "--timeout_mcycles", default_timeout_mcycles); + uint32_t rx_disable_data_check = + test_args::get_command_option_uint32(input_args, "--rx_disable_data_check", default_rx_disable_data_check); + uint32_t rx_disable_header_check = + test_args::get_command_option_uint32(input_args, "--rx_disable_header_check", default_rx_disable_header_check); + uint32_t tx_skip_pkt_content_gen = + test_args::get_command_option_uint32(input_args, "--tx_skip_pkt_content_gen", default_tx_skip_pkt_content_gen); + uint32_t dump_stat_json = + test_args::get_command_option_uint32(input_args, "--dump_stat_json", default_dump_stat_json); + std::string output_dir = test_args::get_command_option(input_args, "--output_dir", std::string(default_output_dir)); + uint32_t check_txrx_timeout = + test_args::get_command_option_uint32(input_args, "--check_txrx_timeout", default_check_txrx_timeout); + uint8_t tx_pkt_dest_size_choice = (uint8_t)test_args::get_command_option_uint32( + input_args, "--tx_pkt_dest_size_choice", default_tx_pkt_dest_size_choice); + uint32_t tx_data_sent_per_iter_low = test_args::get_command_option_uint32( + input_args, "--tx_data_sent_per_iter_low", default_tx_data_sent_per_iter_low); + uint32_t tx_data_sent_per_iter_high = test_args::get_command_option_uint32( + input_args, "--tx_data_sent_per_iter_high", default_tx_data_sent_per_iter_high); + uint32_t fabric_command = + test_args::get_command_option_uint32(input_args, "--fabric_command", default_fabric_command); + uint32_t target_address = + test_args::get_command_option_uint32(input_args, "--target_address", default_target_address); + uint32_t atomic_increment = + test_args::get_command_option_uint32(input_args, "--atomic_increment", default_atomic_increment); + assert( + (pkt_dest_size_choices_t)tx_pkt_dest_size_choice == pkt_dest_size_choices_t::SAME_START_RNDROBIN_FIX_SIZE && + rx_disable_header_check || + (pkt_dest_size_choices_t)tx_pkt_dest_size_choice == pkt_dest_size_choices_t::RANDOM); + + uint32_t test_device_id_l = + test_args::get_command_option_uint32(input_args, "--device_id", default_test_device_id_l); + uint32_t test_device_id_r = + test_args::get_command_option_uint32(input_args, "--device_id_r", default_test_device_id_r); + + uint32_t tx_signal_address = default_tx_signal_address; + + bool pass = true; + + CoreCoord gk_core = {gk_x, gk_y}; + + std::map defines = { + {"FD_CORE_TYPE", std::to_string(0)}, // todo, support dispatch on eth + }; + + try { + const std::filesystem::path tg_mesh_graph_desc_path = + std::filesystem::path(tt::llrt::OptionsG.get_root_dir()) / + "tt_fabric/mesh_graph_descriptors/tg_mesh_graph_descriptor.yaml"; + auto control_plane = std::make_unique(tg_mesh_graph_desc_path.string()); + + int num_devices = tt_metal::GetNumAvailableDevices(); + if (test_device_id_l >= num_devices) { + log_info( + LogTest, "Device {} is not valid. 
Highest valid device id = {}.", test_device_id_l, num_devices - 1); + throw std::runtime_error("Invalid Device Id."); + } + + std::map device_map; + + std::vector chip_ids; + for (unsigned int id = 4; id < 36; id++) { + chip_ids.push_back(id); + } + device_map = tt::tt_metal::detail::CreateDevices(chip_ids); + + log_info(LogTest, "Created {} Devices ...", device_map.size()); + + std::map program_map; + + for (uint32_t i = 4; i < 36; i++) { + program_map[i] = tt_metal::CreateProgram(); + } + + log_info(LogTest, "Created Programs ..."); + + std::map> device_router_map; + + const auto& device_active_eth_cores = device_map[test_device_id_l]->get_active_ethernet_cores(); + + if (device_active_eth_cores.size() == 0) { + log_info( + LogTest, + "Device {} does not have enough active cores. Need 1 active ethernet core for this test.", + test_device_id_l); + for (auto dev : device_map) { + tt_metal::CloseDevice(dev.second); + } + + throw std::runtime_error("Test cannot run on specified device."); + } + + auto [dev_l_mesh_id, dev_l_chip_id] = control_plane->get_mesh_chip_id_from_physical_chip_id(test_device_id_l); + auto [dev_r_mesh_id, dev_r_chip_id] = control_plane->get_mesh_chip_id_from_physical_chip_id(test_device_id_r); + + log_info( + LogTest, + "Running on Left Device {} : Fabric Mesh Id {} : Fabric Device Id {}", + test_device_id_l, + dev_l_mesh_id, + dev_l_chip_id); + log_info( + LogTest, + "Running on Right Device {} : Fabric Mesh Id {} : Fabric Device Id {}", + test_device_id_r, + dev_r_mesh_id, + dev_r_chip_id); + + bool router_core_found = false; + CoreCoord router_logical_core; + CoreCoord router_phys_core; + CoreCoord gk_phys_core; + uint32_t routing_table_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); + uint32_t gk_interface_addr = routing_table_addr + sizeof(tt::tt_fabric::fabric_router_l1_config_t) * 4; + uint32_t client_interface_addr = routing_table_addr + sizeof(tt::tt_fabric::fabric_router_l1_config_t) * 4; + uint32_t client_pull_req_buf_addr = client_interface_addr + sizeof(tt_fabric_client_interface_t); + uint32_t socket_info_addr = gk_interface_addr + sizeof(gatekeeper_info_t); + log_info(LogTest, "GK Routing Table Addr = 0x{:08X}", routing_table_addr); + log_info(LogTest, "GK Info Addr = 0x{:08X}", gk_interface_addr); + log_info(LogTest, "GK Socket Info Addr = 0x{:08X}", socket_info_addr); + + for (auto device : device_map) { + auto neighbors = device.second->get_ethernet_connected_device_ids(); + std::vector device_router_cores; + std::vector device_router_phys_cores; + uint32_t router_mask = 0; + for (auto neighbor : neighbors) { + if (device_map.contains(neighbor)) { + if (!router_core_found && device.first == test_device_id_l) { + // pick a router so that tx and read in routing tables from this core on the + // sender device. 
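+                    // i.e. this router's physical coordinates are handed to the tx kernel via its
+                    // runtime args below, so tx can read routing tables from (and post pull
+                    // requests to) this core.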
+ router_logical_core = device.second->get_ethernet_sockets(neighbor)[0]; + router_phys_core = device.second->ethernet_core_from_logical_core(router_logical_core); + router_core_found = true; + } + auto connected_logical_cores = device.second->get_ethernet_sockets(neighbor); + for (auto logical_core : connected_logical_cores) { + device_router_cores.push_back(logical_core); + device_router_phys_cores.push_back( + device.second->ethernet_core_from_logical_core(logical_core)); + router_mask += 0x1 << logical_core.y; + } + } else { + log_debug( + LogTest, + "Device {} skiping Neighbor Device {} since it is not in test device map.", + device.first, + neighbor); + } + } + + log_info(LogTest, "Device {} router_mask = 0x{:04X}", device.first, router_mask); + uint32_t sem_count = device_router_cores.size(); + device_router_map[device.first] = device_router_phys_cores; + std::vector runtime_args = { + sem_count, // 0: number of active fabric routers + router_mask, // 1: active fabric router mask + }; + gk_phys_core = (device.second->worker_core_from_logical_core(gk_core)); + for (auto logical_core : device_router_cores) { + std::vector router_compile_args = { + (tunneler_queue_size_bytes >> 4), // 0: rx_queue_size_words + gk_interface_addr, // 1: gk_message_addr_l + (gk_phys_core.y << 10) | (gk_phys_core.x << 4), // 2: gk_message_addr_h + tunneler_test_results_addr, // 3: test_results_addr + tunneler_test_results_size, // 4: test_results_size + 0, // 5: timeout_cycles + }; + auto router_kernel = tt_metal::CreateKernel( + program_map[device.first], + "tt_fabric/impl/kernels/tt_fabric_router.cpp", + logical_core, + tt_metal::EthernetConfig{ + .noc = tt_metal::NOC::NOC_0, .compile_args = router_compile_args, .defines = defines}); + + tt_metal::SetRuntimeArgs(program_map[device.first], router_kernel, logical_core, runtime_args); + + log_debug( + LogTest, + "Device {} router added on physical core {}", + device.first, + device.second->ethernet_core_from_logical_core(logical_core)); + } + // setup runtime args + log_info(LogTest, "run tt_fabric gatekeeper at x={},y={}", gk_core.x, gk_core.y); + std::vector gk_compile_args = { + gk_interface_addr, // 0: + socket_info_addr, // 1: + routing_table_addr, // 2 + test_results_addr, // 3: test_results_addr + test_results_size, // 4: test_results_size + 0, // 5: timeout_cycles + }; + + auto kernel = tt_metal::CreateKernel( + program_map[device.first], + "tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp", + {gk_core}, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, + .noc = tt_metal::NOC::RISCV_0_default, + .compile_args = gk_compile_args, + .defines = defines}); + + tt_metal::SetRuntimeArgs(program_map[device.first], kernel, gk_core, runtime_args); + } + + if (check_txrx_timeout) { + defines["CHECK_TIMEOUT"] = ""; + } + + std::vector tx_phys_core; + for (uint32_t i = 0; i < num_src_endpoints; i++) { + CoreCoord core = {tx_x + i, tx_y}; + tx_phys_core.push_back(device_map[test_device_id_l]->worker_core_from_logical_core(core)); + CoreCoord tx_gk_phys_core = device_map[test_device_id_l]->worker_core_from_logical_core(gk_core); + std::vector compile_args = { + (device_map[test_device_id_l]->id() << 8) + src_endpoint_start_id + i, // 0: src_endpoint_id + num_dest_endpoints, // 1: num_dest_endpoints + dest_endpoint_start_id, // 2: + tx_queue_start_addr, // 3: queue_start_addr_words + (tx_queue_size_bytes >> 4), // 4: queue_size_words + routing_table_start_addr, // 5: routeing table + gk_interface_addr, // 6: gk_message_addr_l + 
(tx_gk_phys_core.y << 10) | (tx_gk_phys_core.x << 4), // 7: gk_message_addr_h + test_results_addr, // 8: test_results_addr + test_results_size, // 9: test_results_size + prng_seed, // 10: prng_seed + data_kb_per_tx, // 11: total_data_kb + max_packet_size_words, // 12: max_packet_size_words + timeout_mcycles * 1000 * 1000 * 4, // 13: timeout_cycles + tx_skip_pkt_content_gen, // 14: skip_pkt_content_gen + tx_pkt_dest_size_choice, // 15: pkt_dest_size_choice + tx_data_sent_per_iter_low, // 16: data_sent_per_iter_low + tx_data_sent_per_iter_high, // 17: data_sent_per_iter_high + fabric_command, // 18: fabric command + target_address, + atomic_increment, + tx_signal_address, + client_interface_addr, + client_pull_req_buf_addr, + }; + + // setup runtime args + std::vector runtime_args = { + (device_map[test_device_id_l]->id() << 8) + src_endpoint_start_id + i, // 0: src_endpoint_id + 0x410, // 1: dest_noc_offset + router_phys_core.x, + router_phys_core.y, + (dev_r_mesh_id << 16 | dev_r_chip_id)}; + + if (ASYNC_WR == fabric_command) { + runtime_args.push_back(target_address); + } + + log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); + auto kernel = tt_metal::CreateKernel( + program_map[test_device_id_l], + "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp", + {core}, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, + .noc = tt_metal::NOC::RISCV_0_default, + .compile_args = compile_args, + .defines = defines}); + + tt_metal::SetRuntimeArgs(program_map[test_device_id_l], kernel, core, runtime_args); + } + + std::vector rx_phys_core; + for (uint32_t i = 0; i < num_dest_endpoints; i++) { + CoreCoord core = {rx_x + i, rx_y}; + rx_phys_core.push_back(device_map[test_device_id_r]->worker_core_from_logical_core(core)); + CoreCoord rx_gk_phys_core = device_map[test_device_id_r]->worker_core_from_logical_core(gk_core); + + std::vector compile_args = { + prng_seed, // 0: prng seed + data_kb_per_tx, // 1: total data kb + max_packet_size_words, // 2: max packet size (in words) + fabric_command, // 3: fabric command + target_address, // 4: target address + atomic_increment, // 5: atomic increment + test_results_addr, // 6: test results addr + test_results_size, // 7: test results size in bytes + gk_interface_addr, // 8: gk_message_addr_l + (rx_gk_phys_core.y << 10) | (rx_gk_phys_core.x << 4), // 9: gk_message_addr_h + client_interface_addr, // 10: + client_pull_req_buf_addr, // 11: + rx_queue_start_addr, // 12: queue_start_addr_words + (rx_queue_size_bytes >> 4), // 13: queue_size_words + }; + + std::vector runtime_args = {(dev_l_mesh_id << 16 | dev_l_chip_id)}; + + log_info(LogTest, "run socket rx at x={},y={}", core.x, core.y); + auto kernel = tt_metal::CreateKernel( + program_map[test_device_id_r], + "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp", + {core}, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, + .noc = tt_metal::NOC::RISCV_0_default, + .compile_args = compile_args, + .defines = defines}); + + tt_metal::SetRuntimeArgs(program_map[test_device_id_r], kernel, core, runtime_args); + } + + if (check_txrx_timeout) { + defines.erase("CHECK_TIMEOUT"); + } + + log_info(LogTest, "Starting test..."); + + auto start = std::chrono::system_clock::now(); + + // Initialize tt_fabric router sync semaphore to 0. Routers on each device + // increment once handshake with ethernet peer has been completed. 
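+        // zero_buf clears a single semaphore word; gk_zero_buf spans 12 words (48 bytes), i.e. the
+        // gatekeeper_info_t header (router_sync, ep_sync, routing_planes and padding) that precedes
+        // gk_msg_buf at gk_interface_addr.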
+ std::vector zero_buf(1, 0); + std::vector gk_zero_buf(12, 0); + for (auto [device_id, router_phys_cores] : device_router_map) { + for (auto phys_core : router_phys_cores) { + tt::llrt::write_hex_vec_to_core(device_id, phys_core, zero_buf, FABRIC_ROUTER_SYNC_SEM); + } + tt::llrt::write_hex_vec_to_core(device_id, gk_phys_core, gk_zero_buf, gk_interface_addr); + } + + for (uint32_t i = 0; i < num_src_endpoints; i++) { + tt::llrt::write_hex_vec_to_core(test_device_id_l, tx_phys_core[i], zero_buf, tx_signal_address); + } + + // clear test status to 0. it will be set to non-zero value by tx kernel. + tt::llrt::write_hex_vec_to_core(test_device_id_l, tx_phys_core[0], zero_buf, test_results_addr); + + for (auto device : device_map) { + log_info(LogTest, "Launching on {}", device.first); + tt_metal::detail::LaunchProgram(device.second, program_map[device.first], false); + } + + // Once all the kernels have been launched on all devices, set the tx_start signal + // to trigger tx kernels to start sending data. + std::vector tx_start(1, 1); + for (uint32_t i = 0; i < num_src_endpoints; i++) { + tt::llrt::write_hex_vec_to_core(test_device_id_l, tx_phys_core[i], tx_start, tx_signal_address); + } + + // Wait for tx to return non-zero status. + while (1) { + auto tx_status = tt::llrt::read_hex_vec_from_core(test_device_id_l, tx_phys_core[0], test_results_addr, 4); + if ((tx_status[0] & 0xFFFF) != 0) { + break; + } + } + + log_info(LogTest, "Tx Finished"); + + // terminate gatekeeper. + // Gatekeeper will signal all routers on the device to terminate. + for (auto [device_id, router_phys_cores] : device_router_map) { + tt::llrt::write_hex_vec_to_core(device_id, gk_phys_core, zero_buf, gk_interface_addr); + } + + // wait for all kernels to finish. + for (auto device : device_map) { + tt_metal::detail::WaitProgramDone(device.second, program_map[device.first]); + } + + auto end = std::chrono::system_clock::now(); + + std::chrono::duration elapsed_seconds = (end - start); + log_info(LogTest, "Ran in {:.2f}us", elapsed_seconds.count() * 1000 * 1000); + + vector> tx_results; + + for (uint32_t i = 0; i < num_src_endpoints; i++) { + tx_results.push_back(tt::llrt::read_hex_vec_from_core( + device_map[test_device_id_l]->id(), tx_phys_core[i], test_results_addr, 128)); + log_info( + LogTest, + "TX{} status = {}", + i, + packet_queue_test_status_to_string(tx_results[i][PQ_TEST_STATUS_INDEX])); + pass &= (tx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); + } + /* + TODO: Need to add these once control plane api is available to + get the first and last hop fabric router core. 
+ vector router_results = + tt::llrt::read_hex_vec_from_core( + device_map[test_device_id_l]->id(), tunneler_phys_core, tunneler_test_results_addr, 128); + log_info(LogTest, "L Router status = {}", + packet_queue_test_status_to_string(router_results[PQ_TEST_STATUS_INDEX])); pass &= + (router_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); + + vector r_router_results = + tt::llrt::read_hex_vec_from_core( + device_map[test_device_id_r]->id(), r_tunneler_phys_core, tunneler_test_results_addr, 128); + log_info(LogTest, "R Router status = {}", + packet_queue_test_status_to_string(r_router_results[PQ_TEST_STATUS_INDEX])); pass &= + (r_router_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); + */ + for (auto active_device : device_map) { + pass &= tt_metal::CloseDevice(active_device.second); + } + + if (pass) { + double total_tx_bw = 0.0; + uint64_t total_tx_words_sent = 0; + uint64_t total_rx_words_checked = 0; + for (uint32_t i = 0; i < num_src_endpoints; i++) { + uint64_t tx_words_sent = get_64b_result(tx_results[i], PQ_TEST_WORD_CNT_INDEX); + total_tx_words_sent += tx_words_sent; + uint64_t tx_elapsed_cycles = get_64b_result(tx_results[i], PQ_TEST_CYCLES_INDEX); + double tx_bw = ((double)tx_words_sent) * PACKET_WORD_SIZE_BYTES / tx_elapsed_cycles; + total_tx_bw += tx_bw; + uint64_t iter = get_64b_result(tx_results[i], PQ_TEST_ITER_INDEX); + // uint64_t zero_data_sent_iter = get_64b_result(tx_results[i], TX_TEST_IDX_ZERO_DATA_WORDS_SENT_ITER); + // uint64_t few_data_sent_iter = get_64b_result(tx_results[i], TX_TEST_IDX_FEW_DATA_WORDS_SENT_ITER); + // uint64_t many_data_sent_iter = get_64b_result(tx_results[i], TX_TEST_IDX_MANY_DATA_WORDS_SENT_ITER); + // uint64_t num_packets = get_64b_result(tx_results[i], TX_TEST_IDX_NPKT); + // double bytes_per_pkt = static_cast(tx_words_sent) * PACKET_WORD_SIZE_BYTES / + // static_cast(num_packets); + + log_info( + LogTest, + "TX {} words sent = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", + i, + tx_words_sent, + tx_elapsed_cycles, + tx_bw); + // log_info(LogTest, "TX {} packets sent = {}, bytes/packet = {:.2f}, total iter = {}, zero data sent + // iter = {}, few data sent iter = {}, many data sent iter = {}", i, num_packets, bytes_per_pkt, iter, + // zero_data_sent_iter, few_data_sent_iter, many_data_sent_iter); + /* + stat[fmt::format("tx_words_sent_{}", i)] = tx_words_sent; + stat[fmt::format("tx_elapsed_cycles_{}", i)] = tx_elapsed_cycles; + stat[fmt::format("tx_bw_{}", i)] = tx_bw; + stat[fmt::format("tx_bytes_per_pkt_{}", i)] = bytes_per_pkt; + stat[fmt::format("tx_total_iter_{}", i)] = iter; + stat[fmt::format("tx_zero_data_sent_iter_{}", i)] = zero_data_sent_iter; + stat[fmt::format("tx_few_data_sent_iter_{}", i)] = few_data_sent_iter; + stat[fmt::format("tx_many_data_sent_iter_{}", i)] = many_data_sent_iter; + */ + } + log_info(LogTest, "Total TX BW = {:.2f} B/cycle", total_tx_bw); + } + + } catch (const std::exception& e) { + pass = false; + log_fatal(e.what()); + } + + tt::llrt::OptionsG.set_kernels_nullified(false); + + if (pass) { + log_info(LogTest, "Test Passed"); + return 0; + } else { + log_fatal(LogTest, "Test Failed\n"); + return 1; + } +} diff --git a/tt_fabric/hw/inc/routing_table.h b/tt_fabric/hw/inc/routing_table.h index aac39816745..70c862cc009 100644 --- a/tt_fabric/hw/inc/routing_table.h +++ b/tt_fabric/hw/inc/routing_table.h @@ -57,6 +57,7 @@ struct fabric_router_l1_config_t { std::uint16_t my_mesh_id; // Do we need this if we tag routing tables with magic values for outbound eth channels // and route to 
local NOC? std::uint16_t my_device_id; + std::uint8_t padding[8]; // pad to 16-byte alignment. } __attribute__((packed)); /* diff --git a/tt_fabric/hw/inc/tt_fabric.h b/tt_fabric/hw/inc/tt_fabric.h index d5196c6605a..10032f14961 100644 --- a/tt_fabric/hw/inc/tt_fabric.h +++ b/tt_fabric/hw/inc/tt_fabric.h @@ -25,6 +25,10 @@ extern volatile tt::tt_fabric::fabric_router_l1_config_t* routing_table; uint64_t tt_fabric_send_pull_request(uint64_t dest_addr, volatile local_pull_request_t* local_pull_request); bool tt_fabric_is_header_valid(packet_header_t* p_header); +uint32_t num_words_available_to_pull(volatile pull_request_t* pull_request); +uint32_t words_before_buffer_wrap(uint32_t buffer_size, uint32_t rd_ptr); +uint32_t advance_ptr(uint32_t buffer_size, uint32_t ptr, uint32_t inc_words); +uint32_t get_rd_ptr_offset_words(pull_request_t* pull_request); inline uint64_t get_timestamp() { uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); @@ -471,7 +475,7 @@ typedef struct fvc_producer_state { } } - template + template inline uint32_t pull_data_from_fvc_buffer() { uint32_t words_available = get_num_words_available(); words_available = std::min(words_available, packet_words_remaining); @@ -493,7 +497,10 @@ typedef struct fvc_producer_state { packet_words_remaining -= words_available; advance_out_rdptr(words_available); // issue noc write to noc target of pull request. - uint64_t dest_addr = ((uint64_t)get_next_hop_router_noc_xy() << 32) | FABRIC_ROUTER_REQ_QUEUE_START; + uint64_t dest_addr = socket_mode == false + ? ((uint64_t)get_next_hop_router_noc_xy() << 32) | FABRIC_ROUTER_REQ_QUEUE_START + : ((uint64_t)current_packet_header.session.target_offset_h << 32) | + current_packet_header.session.target_offset_l; packet_dest = tt_fabric_send_pull_request(dest_addr, local_pull_request); if (current_packet_header.routing.flags == INLINE_FORWARD) { curr_packet_valid = false; @@ -547,51 +554,59 @@ typedef struct fvc_producer_state { inline uint32_t process_inbound_packet() { uint32_t words_processed = 0; if (packet_is_for_local_chip()) { - if (current_packet_header.routing.flags == FORWARD && current_packet_header.session.command == ASYNC_WR) { - if (packet_in_progress == 0) { - packet_dest = ((uint64_t)current_packet_header.session.target_offset_h << 32) | - current_packet_header.session.target_offset_l; + if (current_packet_header.routing.flags == FORWARD) { + if (current_packet_header.session.command == ASYNC_WR) { + if (packet_in_progress == 0) { + packet_dest = ((uint64_t)current_packet_header.session.target_offset_h << 32) | + current_packet_header.session.target_offset_l; + packet_words_remaining -= PACKET_HEADER_SIZE_WORDS; + advance_out_wrptr(PACKET_HEADER_SIZE_WORDS); + advance_out_rdptr(PACKET_HEADER_SIZE_WORDS); + // subtract the header words. Remaining words are the data to be written to packet_dest. + // Remember to account for trailing bytes which may not be a full packet word. 
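+                        // e.g. with PACKET_WORD_SIZE_BYTES == 16, a 70-byte write payload spans
+                        // 5 packet words and only 6 bytes of the last word are valid.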
+ packet_in_progress = 1; + words_processed = PACKET_HEADER_SIZE_WORDS; + words_processed += issue_async_write(); + } else { + flush_async_writes(); + if (packet_words_remaining) { + words_processed = issue_async_write(); + } else { + packet_in_progress = 0; + curr_packet_valid = false; + packet_timestamp = get_timestamp(); + } + } + } else if (current_packet_header.session.command == DSOCKET_WR) { + words_processed = pull_data_from_fvc_buffer(); + } + } else if (current_packet_header.routing.flags == INLINE_FORWARD) { + if (current_packet_header.session.command == SOCKET_CLOSE) { + words_processed = pull_data_from_fvc_buffer(); + } else { + uint64_t noc_addr = ((uint64_t)current_packet_header.session.target_offset_h << 32) | + current_packet_header.session.target_offset_l; + noc_fast_atomic_increment( + noc_index, + NCRISC_AT_CMD_BUF, + noc_addr, + NOC_UNICAST_WRITE_VC, + current_packet_header.packet_parameters.atomic_parameters.increment, + current_packet_header.packet_parameters.atomic_parameters.wrap_boundary, + false); + packet_words_remaining -= PACKET_HEADER_SIZE_WORDS; advance_out_wrptr(PACKET_HEADER_SIZE_WORDS); advance_out_rdptr(PACKET_HEADER_SIZE_WORDS); - // subtract the header words. Remaining words are the data to be written to packet_dest. - // Remember to account for trailing bytes which may not be a full packet word. - packet_in_progress = 1; words_processed = PACKET_HEADER_SIZE_WORDS; - words_processed += issue_async_write(); - } else { - flush_async_writes(); - if (packet_words_remaining) { - words_processed = issue_async_write(); - } else { - packet_in_progress = 0; - curr_packet_valid = false; - packet_timestamp = get_timestamp(); - } + fvc_pull_rdptr = fvc_out_rdptr; + update_remote_rdptr_cleared(); + curr_packet_valid = false; + packet_timestamp = get_timestamp(); } - } else if (current_packet_header.routing.flags == INLINE_FORWARD) { - uint64_t noc_addr = ((uint64_t)current_packet_header.session.target_offset_h << 32) | - current_packet_header.session.target_offset_l; - noc_fast_atomic_increment( - noc_index, - NCRISC_AT_CMD_BUF, - noc_addr, - NOC_UNICAST_WRITE_VC, - current_packet_header.packet_parameters.atomic_parameters.increment, - current_packet_header.packet_parameters.atomic_parameters.wrap_boundary, - false); - - packet_words_remaining -= PACKET_HEADER_SIZE_WORDS; - advance_out_wrptr(PACKET_HEADER_SIZE_WORDS); - advance_out_rdptr(PACKET_HEADER_SIZE_WORDS); - words_processed = PACKET_HEADER_SIZE_WORDS; - fvc_pull_rdptr = fvc_out_rdptr; - update_remote_rdptr_cleared(); - curr_packet_valid = false; - packet_timestamp = get_timestamp(); } } else { - words_processed = pull_data_from_fvc_buffer(); + words_processed = pull_data_from_fvc_buffer(); } return words_processed; } @@ -604,6 +619,518 @@ typedef struct fvc_producer_state { } } fvc_producer_state_t; +typedef struct fvcc_outbound_state { + volatile chan_payload_ptr remote_rdptr; + uint32_t remote_ptr_update_addr; + volatile ctrl_chan_msg_buf* + fvcc_buf; // fvcc buffer that receives messages that need to be forwarded over ethernet. + volatile ctrl_chan_sync_buf* fvcc_sync_buf; // sync buffer to hold pointer updates sent over ethernet + uint32_t remote_fvcc_buf_start; + uint32_t out_rdptr; + + // Check if ethernet receiver fvcc on neighboring chip is full. + bool is_remote_fvcc_full() { + uint32_t rd_ptr = remote_rdptr.ptr_cleared; + bool full = false; + if (out_rdptr != rd_ptr) { + uint32_t distance = out_rdptr >= rd_ptr ? 
out_rdptr - rd_ptr : out_rdptr + 2 * FVCC_BUF_SIZE - rd_ptr; + full = distance >= FVCC_BUF_SIZE; + } + return full; + } + + inline void init(uint32_t buf_start, uint32_t sync_buf_start, uint32_t remote_buf_start, uint32_t ptr_update_addr) { + uint32_t words = sizeof(fvcc_outbound_state) / 4; + uint32_t* ptr = (uint32_t*)this; + for (uint32_t i = 0; i < words; i++) { + ptr[i] = 0; + } + fvcc_buf = (volatile ctrl_chan_msg_buf*)buf_start; + fvcc_sync_buf = (volatile ctrl_chan_sync_buf*)sync_buf_start; + remote_fvcc_buf_start = remote_buf_start; + remote_ptr_update_addr = ptr_update_addr; + } + + inline uint32_t inc_ptr_with_wrap(uint32_t ptr) { return (ptr + 1) & FVCC_PTR_MASK; } + + inline void advance_out_rdptr() { out_rdptr = inc_ptr_with_wrap(out_rdptr); } + + inline void advance_fvcc_rdptr() { + uint32_t rd_ptr = remote_rdptr.ptr; + while (rd_ptr != fvcc_buf->rdptr.ptr) { + uint32_t msg_index = fvcc_buf->rdptr.ptr & FVCC_SIZE_MASK; + fvcc_buf->msg_buf[msg_index].packet_header.routing.flags = 0; + fvcc_buf->rdptr.ptr = inc_ptr_with_wrap(fvcc_buf->rdptr.ptr); + } + } + + template + inline uint32_t forward_data_from_fvcc_buffer() { + // If receiver ethernet fvcc is full, we cannot send more messages. + if (is_remote_fvcc_full()) { + return 0; + } + + if (fvcc_buf->wrptr.ptr != out_rdptr) { + // There are new messages to forward. + uint32_t msg_index = out_rdptr & FVCC_SIZE_MASK; + volatile packet_header_t* msg = &fvcc_buf->msg_buf[msg_index].packet_header; + bool msg_valid = msg->routing.flags != 0; + if (!msg_valid) { + return 0; + } + + uint32_t dest_addr = + remote_fvcc_buf_start + offsetof(ctrl_chan_msg_buf, msg_buf) + msg_index * sizeof(packet_header_t); + internal_::eth_send_packet( + 0, + (uint32_t)msg / PACKET_WORD_SIZE_BYTES, + dest_addr / PACKET_WORD_SIZE_BYTES, + PACKET_HEADER_SIZE_WORDS); + advance_out_rdptr(); + + uint32_t* sync_ptr = (uint32_t*)&fvcc_sync_buf->ptr[msg_index]; + sync_ptr[0] = out_rdptr; + sync_ptr[1] = 0; + sync_ptr[2] = 0; + sync_ptr[3] = out_rdptr; + internal_::eth_send_packet( + 0, ((uint32_t)sync_ptr) / PACKET_WORD_SIZE_BYTES, remote_ptr_update_addr / PACKET_WORD_SIZE_BYTES, 1); + } + + return PACKET_HEADER_SIZE_WORDS; + } + + inline void fvcc_handler() { + forward_data_from_fvcc_buffer(); + advance_fvcc_rdptr(); + } + +} fvcc_outbound_state_t; + +static_assert(sizeof(fvcc_outbound_state_t) % 4 == 0); + +// Fabric Virtual Control Channel (FVCC) Producer receives control/sync packets over ethernet from neighboring chip. +// Data in the producer is either destined for local chip, or has to make a noc hop +// to next outgoing ethernet port enroute to final destination. +// Control packets are forwarded to next fvcc consumer buffer in the route +// direction, if not meant for local device. +// If control packet is addressed to local device, FVCC producer can process the packet locally if +// it is a read/write ack, or forward the packet to Gatekeeper for further local processing. +typedef struct fvcc_inbound_state { + volatile chan_payload_ptr inbound_wrptr; + volatile chan_payload_ptr inbound_rdptr; + uint32_t remote_ptr_update_addr; + volatile ctrl_chan_msg_buf* fvcc_buf; // fvcc buffer that receives incoming control messages over ethernet. + uint8_t chan_num; + uint8_t packet_in_progress; + uint8_t pad1; + uint8_t pad2; + uint32_t fvc_out_wrptr; + uint32_t fvc_out_rdptr; + bool curr_packet_valid; + bool packet_corrupted; + uint64_t packet_timestamp; + uint64_t gk_fvcc_buf_addr; // fvcc buffer in gatekeeper. 
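+    // current_packet_header (below) points at the entry in fvcc_buf that is currently being
+    // processed; it is set by get_curr_packet_valid() once a valid header is observed.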
+ volatile packet_header_t* current_packet_header; + + inline void init(uint32_t buf_start, uint32_t ptr_update_addr, uint64_t gk_fvcc_buf_start) { + uint32_t words = sizeof(fvcc_inbound_state) / 4; + uint32_t* ptr = (uint32_t*)this; + for (uint32_t i = 0; i < words; i++) { + ptr[i] = 0; + } + chan_num = 1; + fvcc_buf = (volatile ctrl_chan_msg_buf*)buf_start; + gk_fvcc_buf_addr = gk_fvcc_buf_start; + remote_ptr_update_addr = ptr_update_addr; + } + + inline uint32_t inc_ptr_with_wrap(uint32_t ptr, uint32_t inc) { return (ptr + inc) & FVCC_PTR_MASK; } + + inline void advance_local_wrptr(uint32_t inc) { inbound_wrptr.ptr = inc_ptr_with_wrap(inbound_wrptr.ptr, inc); } + + inline void advance_out_wrptr(uint32_t inc) { fvc_out_wrptr = inc_ptr_with_wrap(fvc_out_wrptr, inc); } + + inline void advance_out_rdptr(uint32_t inc) { fvc_out_rdptr = inc_ptr_with_wrap(fvc_out_rdptr, inc); } + + inline uint32_t get_num_msgs_available() const { + uint32_t wrptr = inbound_wrptr.ptr; + uint32_t msgs = 0; + if (fvc_out_rdptr != wrptr) { + msgs = wrptr > fvc_out_rdptr ? wrptr - fvc_out_rdptr : wrptr + 2 * FVCC_BUF_SIZE - fvc_out_rdptr; + } + return msgs; + } + + inline uint32_t get_num_msgs_free() { return FVCC_BUF_SIZE - get_num_msgs_available(); } + + inline uint32_t words_before_local_buffer_wrap() { + if (inbound_wrptr.ptr >= FVCC_BUF_SIZE) { + return (FVCC_BUF_SIZE * 2 - inbound_wrptr.ptr) * PACKET_HEADER_SIZE_WORDS; + } else { + return (FVCC_BUF_SIZE - inbound_wrptr.ptr) * PACKET_HEADER_SIZE_WORDS; + } + } + + inline bool get_curr_packet_valid() { + if (!curr_packet_valid && (get_num_msgs_available() >= 1)) { + uint32_t msg_index = fvc_out_rdptr & FVCC_SIZE_MASK; + bool msg_valid = fvcc_buf->msg_buf[msg_index].packet_header.routing.flags != 0; + if (msg_valid) { + current_packet_header = (volatile packet_header_t*)&fvcc_buf->msg_buf[msg_index]; + if (tt_fabric_is_header_valid((packet_header_t*)current_packet_header)) { + this->curr_packet_valid = true; + } else { + this->packet_corrupted = true; + } + } + } + return this->curr_packet_valid; + } + + inline uint32_t get_local_buffer_read_addr() { + uint32_t addr = (uint32_t)&fvcc_buf->msg_buf[fvc_out_rdptr & FVCC_SIZE_MASK]; + return addr; + } + + inline uint32_t get_local_buffer_write_addr() { + uint32_t addr = (uint32_t)&fvcc_buf->msg_buf[inbound_wrptr.ptr & FVCC_SIZE_MASK]; + ; + return addr; + } + + template + inline void update_remote_rdptr_sent() { + if (inbound_wrptr.ptr_cleared != inbound_rdptr.ptr) { + inbound_rdptr.ptr = inbound_wrptr.ptr_cleared; + if constexpr (fvc_mode == FVC_MODE_ROUTER) { + internal_::eth_send_packet( + 0, + ((uint32_t)&inbound_rdptr) / PACKET_WORD_SIZE_BYTES, + remote_ptr_update_addr / PACKET_WORD_SIZE_BYTES, + 1); + } + } + } + + template + inline void update_remote_rdptr_cleared() { + if (fvc_out_rdptr != inbound_rdptr.ptr_cleared) { + inbound_rdptr.ptr_cleared = fvc_out_rdptr; + if constexpr (fvc_mode == FVC_MODE_ROUTER) { + internal_::eth_send_packet( + 0, + ((uint32_t)&inbound_rdptr) / PACKET_WORD_SIZE_BYTES, + remote_ptr_update_addr / PACKET_WORD_SIZE_BYTES, + 1); + } + } + } + + uint32_t get_next_hop_router_noc_xy() { + uint32_t dst_mesh_id = current_packet_header->routing.dst_mesh_id; + if (dst_mesh_id != routing_table->my_mesh_id) { + uint32_t next_port = routing_table->inter_mesh_table.dest_entry[dst_mesh_id]; + return eth_chan_to_noc_xy[noc_index][next_port]; + } else { + uint32_t dst_device_id = current_packet_header->routing.dst_dev_id; + uint32_t next_port = 
routing_table->intra_mesh_table.dest_entry[dst_device_id]; + return eth_chan_to_noc_xy[noc_index][next_port]; + } + } + + inline bool packet_is_for_local_chip() { + return (current_packet_header->routing.dst_mesh_id == routing_table->my_mesh_id) && + (current_packet_header->routing.dst_dev_id == routing_table->my_device_id); + } + + // issue a pull request. + // currently blocks till the request queue has space. + // This needs to be non blocking, so that if one fvc pull request queue is full, + // we can process other fvcs and come back to check status of this pull request later. + inline void forward_message(uint64_t dest_addr) { + uint64_t noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, wrptr); + noc_fast_atomic_increment( + noc_index, + NCRISC_AT_CMD_BUF, + noc_addr, + NOC_UNICAST_WRITE_VC, + 1, + FVCC_BUF_LOG_SIZE, + false, + false, + (uint32_t)&fvcc_buf->wrptr.ptr); + while (!ncrisc_noc_nonposted_atomics_flushed(noc_index)); + uint32_t wrptr = fvcc_buf->wrptr.ptr; + noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, rdptr); + while (1) { + noc_async_read_one_packet(noc_addr, (uint32_t)(&fvcc_buf->rdptr.ptr), 4); + noc_async_read_barrier(); + if (!fvcc_buf_ptrs_full(wrptr, fvcc_buf->rdptr.ptr)) { + break; + } +#if defined(COMPILE_FOR_ERISC) + else { + // Consumer pull request buffer is full + // Context switch to enable base firmware routing + // as it might be handling slow dispatch traffic + internal_::risc_context_switch(); + } +#endif + } + uint32_t dest_wr_index = wrptr & FVCC_SIZE_MASK; + noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, msg_buf) + dest_wr_index * sizeof(packet_header_t); + noc_async_write_one_packet((uint32_t)(current_packet_header), noc_addr, sizeof(packet_header_t), noc_index); + } + + template + inline void process_inbound_packet() { + if (packet_is_for_local_chip()) { + if (current_packet_header->routing.flags == SYNC) { + if (current_packet_header->session.command == SOCKET_OPEN or + current_packet_header->session.command == SOCKET_CONNECT) { + // forward socket related messages to gatekeeper. + forward_message(gk_fvcc_buf_addr); + } else if (current_packet_header->session.command == ASYNC_WR_RESP) { + // Write response. Decrement transaction count for respective transaction id. + uint64_t noc_addr = ((uint64_t)current_packet_header->session.target_offset_h << 32) | + current_packet_header->session.target_offset_l; + noc_fast_atomic_increment( + noc_index, NCRISC_AT_CMD_BUF, noc_addr, NOC_UNICAST_WRITE_VC, -1, 31, false); + } + } + } else { + // Control message is not meant for local chip. Forward to next router enroute to destination. 
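+            // dest_addr packs the next-hop router's NOC XY in the upper 32 bits and the L1 offset of
+            // its outbound FVCC buffer (FVCC_OUT_BUF_START) in the lower 32 bits, mirroring how data
+            // pull requests target FABRIC_ROUTER_REQ_QUEUE_START.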
+ uint64_t dest_addr = ((uint64_t)get_next_hop_router_noc_xy() << 32) | FVCC_OUT_BUF_START; + forward_message(dest_addr); + } + curr_packet_valid = false; + advance_out_wrptr(1); + advance_out_rdptr(1); + noc_async_write_barrier(); + update_remote_rdptr_cleared(); + } + + template + inline void fvcc_handler() { + if (get_curr_packet_valid()) { + process_inbound_packet(); + } + update_remote_rdptr_sent(); + } + +} fvcc_inbound_state_t; + +typedef struct socket_reader_state { + volatile chan_payload_ptr remote_rdptr; + uint8_t packet_in_progress; + + uint32_t packet_words_remaining; + uint32_t fvc_out_wrptr; + uint32_t fvc_out_rdptr; + uint32_t fvc_pull_wrptr; + uint32_t buffer_size; + uint32_t buffer_start; + uint32_t remote_buffer_start; + uint32_t pull_words_in_flight; + uint32_t words_since_last_sync; + + uint32_t get_num_words_free() { + uint32_t rd_ptr = remote_rdptr.ptr; + uint32_t words_occupied = 0; + if (fvc_pull_wrptr != rd_ptr) { + words_occupied = + fvc_pull_wrptr > rd_ptr ? fvc_pull_wrptr - rd_ptr : buffer_size * 2 + fvc_pull_wrptr - rd_ptr; + } + return buffer_size - words_occupied; + } + + uint32_t get_remote_num_words_free() { + uint32_t rd_ptr = remote_rdptr.ptr_cleared; + uint32_t words_occupied = 0; + if (fvc_out_wrptr != rd_ptr) { + words_occupied = fvc_out_wrptr > rd_ptr ? fvc_out_wrptr - rd_ptr : buffer_size * 2 + fvc_out_wrptr - rd_ptr; + } + return buffer_size - words_occupied; + } + + inline void init(uint32_t data_buf_start, uint32_t data_buf_size_words) { + uint32_t words = sizeof(socket_reader_state) / 4; + uint32_t* ptr = (uint32_t*)this; + for (uint32_t i = 0; i < words; i++) { + ptr[i] = 0; + } + buffer_start = data_buf_start; + buffer_size = data_buf_size_words; + remote_buffer_start = data_buf_start + buffer_size * PACKET_WORD_SIZE_BYTES; + } + + inline uint32_t words_before_local_buffer_wrap() { + if (fvc_pull_wrptr >= buffer_size) { + return buffer_size * 2 - fvc_pull_wrptr; + } else { + return buffer_size - fvc_pull_wrptr; + } + } + + inline uint32_t get_local_buffer_pull_addr() { + uint32_t addr = buffer_start; + uint32_t offset = fvc_pull_wrptr; + if (offset >= buffer_size) { + offset -= buffer_size; + } + addr = addr + (offset * PACKET_WORD_SIZE_BYTES); + return addr; + } + + inline uint32_t get_local_buffer_read_addr() { + uint32_t addr = buffer_start; + uint32_t offset = fvc_out_rdptr; + if (offset >= buffer_size) { + offset -= buffer_size; + } + addr = addr + (offset * PACKET_WORD_SIZE_BYTES); + return addr; + } + + inline uint32_t get_remote_buffer_write_addr() { + uint32_t addr = remote_buffer_start; + uint32_t offset = fvc_out_wrptr; + if (offset >= buffer_size) { + offset -= buffer_size; + } + addr = addr + (offset * PACKET_WORD_SIZE_BYTES); + return addr; + } + + inline void advance_pull_wrptr(uint32_t num_words) { + uint32_t temp = fvc_pull_wrptr + num_words; + if (temp >= buffer_size * 2) { + temp -= buffer_size * 2; + } + fvc_pull_wrptr = temp; + } + + inline void advance_out_wrptr(uint32_t num_words) { + uint32_t temp = fvc_out_wrptr + num_words; + if (temp >= buffer_size * 2) { + temp -= buffer_size * 2; + } + fvc_out_wrptr = temp; + } + + inline void advance_out_rdptr(uint32_t num_words) { + uint32_t temp = fvc_out_rdptr + num_words; + if (temp >= buffer_size * 2) { + temp -= buffer_size * 2; + } + fvc_out_rdptr = temp; + } + + inline void register_pull_data(uint32_t num_words_to_pull) { + pull_words_in_flight += num_words_to_pull; + advance_pull_wrptr(num_words_to_pull); + words_since_last_sync += num_words_to_pull; + 
packet_words_remaining -= num_words_to_pull; + } + + inline uint32_t get_num_words_to_pull(volatile pull_request_t* pull_request) { + uint32_t num_words_to_pull = num_words_available_to_pull(pull_request); + uint32_t num_words_before_wrap = words_before_buffer_wrap(pull_request->buffer_size, pull_request->rd_ptr); + + num_words_to_pull = std::min(num_words_to_pull, num_words_before_wrap); + uint32_t socket_buffer_space = get_num_words_free(); + num_words_to_pull = std::min(num_words_to_pull, socket_buffer_space); + + if (num_words_to_pull == 0) { + return 0; + } + + uint32_t space_before_wptr_wrap = words_before_local_buffer_wrap(); + num_words_to_pull = std::min(num_words_to_pull, space_before_wptr_wrap); + + return num_words_to_pull; + } + + inline uint32_t pull_socket_data(volatile pull_request_t* pull_request) { + volatile uint32_t* temp = (volatile uint32_t*)0xffb2010c; + if (packet_in_progress == 0) { + uint32_t size = pull_request->size; + packet_words_remaining = (size + PACKET_WORD_SIZE_BYTES - 1) >> 4; + packet_in_progress = 1; + } + + uint32_t num_words_to_pull = get_num_words_to_pull(pull_request); + if (num_words_to_pull == 0) { + temp[0] = 0xdead1111; + return 0; + } + + uint32_t rd_offset = get_rd_ptr_offset_words((pull_request_t*)pull_request); + uint64_t src_addr = pull_request->buffer_start + (rd_offset * PACKET_WORD_SIZE_BYTES); + uint32_t local_addr = get_local_buffer_pull_addr(); + + // pull_data_from_remote(); + noc_async_read(src_addr, local_addr, num_words_to_pull * PACKET_WORD_SIZE_BYTES); + register_pull_data(num_words_to_pull); + pull_request->rd_ptr = advance_ptr(pull_request->buffer_size, pull_request->rd_ptr, num_words_to_pull); + + return num_words_to_pull; + } + + template + inline uint32_t push_socket_data() { + uint32_t total_words_to_forward = 0; + uint32_t wrptr = fvc_pull_wrptr; + + total_words_to_forward = + wrptr > fvc_out_rdptr ? wrptr - fvc_out_rdptr : buffer_size * 2 + wrptr - fvc_out_rdptr; + + uint32_t remote_fvc_buffer_space = get_remote_num_words_free(); + total_words_to_forward = std::min(total_words_to_forward, remote_fvc_buffer_space); + if (total_words_to_forward == 0) { + return 0; + } + + if (packet_words_remaining and (words_since_last_sync < FVC_SYNC_THRESHOLD)) { + // not enough data to forward. + // wait for more data. + return 0; + } + + if constexpr (live == true) { + uint32_t src_addr = 0; + uint32_t dest_addr = 0; // should be second half of fvc buffer. + uint32_t words_remaining = total_words_to_forward; + while (words_remaining) { + uint32_t num_words_before_local_wrap = words_before_buffer_wrap(buffer_size, fvc_out_rdptr); + uint32_t num_words_before_remote_wrap = words_before_buffer_wrap(buffer_size, fvc_out_wrptr); + uint32_t words_to_forward = std::min(num_words_before_local_wrap, num_words_before_remote_wrap); + words_to_forward = std::min(words_to_forward, words_remaining); + // max 8K bytes + words_to_forward = std::min(words_to_forward, DEFAULT_MAX_NOC_SEND_WORDS); + src_addr = get_local_buffer_read_addr(); + dest_addr = get_remote_buffer_write_addr(); + + // TODO: Issue a noc write here to forward data. 
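+                // One possible sketch (an assumption, not the actual implementation): write from the
+                // local read offset to the consumer's buffer before publishing pointer updates, e.g.
+                //   noc_async_write(src_addr, get_noc_addr_helper(receiver_noc_xy, dest_addr),
+                //                   words_to_forward * PACKET_WORD_SIZE_BYTES);
+                // where receiver_noc_xy is a hypothetical field holding the consumer core's NOC
+                // coordinates; a write barrier would be needed before remote_rdptr is advanced.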
+ + advance_out_rdptr(words_to_forward); + advance_out_wrptr(words_to_forward); + words_remaining -= words_to_forward; + } + } else { + advance_out_rdptr(total_words_to_forward); + advance_out_wrptr(total_words_to_forward); + remote_rdptr.ptr = fvc_out_rdptr; + remote_rdptr.ptr_cleared = fvc_out_wrptr; + } + words_since_last_sync -= total_words_to_forward; + return total_words_to_forward; + } +} socket_reader_state_t; + +static_assert(sizeof(socket_reader_state_t) % 4 == 0); typedef struct router_state { uint32_t sync_in; @@ -615,28 +1142,6 @@ typedef struct router_state { inline uint64_t get_timestamp_32b() { return reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); } -void tt_fabric_add_header_checksum(packet_header_t* p_header) { - uint16_t* ptr = (uint16_t*)p_header; - uint32_t sum = 0; - for (uint32_t i = 2; i < sizeof(packet_header_t) / 2; i++) { - sum += ptr[i]; - } - sum = ~sum; - sum += sum; - p_header->packet_parameters.misc_parameters.words[0] = sum; -} - -bool tt_fabric_is_header_valid(packet_header_t* p_header) { - uint16_t* ptr = (uint16_t*)p_header; - uint32_t sum = 0; - for (uint32_t i = 2; i < sizeof(packet_header_t) / 2; i++) { - sum += ptr[i]; - } - sum = ~sum; - sum += sum; - return (p_header->packet_parameters.misc_parameters.words[0] == sum); -} - void zero_l1_buf(tt_l1_ptr uint32_t* buf, uint32_t size_bytes) { for (uint32_t i = 0; i < size_bytes / 4; i++) { buf[i] = 0; diff --git a/tt_fabric/hw/inc/tt_fabric_api.h b/tt_fabric/hw/inc/tt_fabric_api.h index 6978f58ff9d..98a4037fb29 100644 --- a/tt_fabric/hw/inc/tt_fabric_api.h +++ b/tt_fabric/hw/inc/tt_fabric_api.h @@ -9,17 +9,12 @@ #include "dataflow_api.h" #include "noc_overlay_parameters.h" #include "ethernet/dataflow_api.h" -#include "tt_fabric.hpp" +#include "tt_fabric_interface.h" -typedef struct tt_fabric_endpoint_sync { - uint32_t sync_addr : 24; - uint32_t endpoint_type : 8 -} tt_fabric_endpoint_sync_t; - -static_assert(sizeof(tt_fabric_endpoint_sync_t) == 4); - -extern local_pull_request_t local_pull_request; +extern volatile local_pull_request_t* local_pull_request; +extern volatile tt_fabric_client_interface_t* client_interface; +/* inline void fabric_async_write( uint32_t routing_plane, // the network plane to use for this transaction uint32_t src_addr, // source address in sender’s memory @@ -40,56 +35,125 @@ inline void fabric_async_write( uint64_t router_request_queue = NOC_XY_ADDR(2, 0, 0x19000); tt_fabric_send_pull_request(router_request_queue, &local_pull_request); } +*/ -/** - * Polling for ready signal from the remote peers of all input and output queues. - * Blocks until all are ready, but doesn't block polling on each individual queue. - * Returns false in case of timeout. 
- */ -bool wait_all_src_dest_ready(volatile router_state_t* router_state, uint32_t timeout_cycles = 0) { - bool src_ready = false; - bool dest_ready = false; +inline uint32_t get_next_hop_router_noc_xy(uint32_t routing_plane, uint32_t dst_mesh_id, uint32_t dst_dev_id) { + if (dst_mesh_id != routing_table[routing_plane].my_mesh_id) { + uint32_t next_port = routing_table[routing_plane].inter_mesh_table.dest_entry[dst_mesh_id]; + return eth_chan_to_noc_xy[noc_index][next_port]; + } else { + uint32_t next_port = routing_table[routing_plane].intra_mesh_table.dest_entry[dst_dev_id]; + return eth_chan_to_noc_xy[noc_index][next_port]; + } +} - uint32_t iters = 0; +inline void send_message_to_gk() { + uint64_t gk_noc_base = client_interface->gk_msg_buf_addr; + uint64_t noc_addr = gk_noc_base + offsetof(ctrl_chan_msg_buf, wrptr); + noc_fast_atomic_increment( + noc_index, + NCRISC_AT_CMD_BUF, + noc_addr, + NOC_UNICAST_WRITE_VC, + 1, + FVCC_BUF_LOG_SIZE, + false, + false, + (uint32_t)&client_interface->wrptr.ptr); + while (!ncrisc_noc_nonposted_atomics_flushed(noc_index)); + uint32_t wrptr = client_interface->wrptr.ptr; + noc_addr = gk_noc_base + offsetof(ctrl_chan_msg_buf, rdptr); + while (1) { + noc_async_read_one_packet(noc_addr, (uint32_t)(&client_interface->rdptr.ptr), 4); + noc_async_read_barrier(); + if (!fvcc_buf_ptrs_full(wrptr, client_interface->rdptr.ptr)) { + break; + } + } + uint32_t dest_wr_index = wrptr & FVCC_SIZE_MASK; + noc_addr = gk_noc_base + offsetof(ctrl_chan_msg_buf, msg_buf) + dest_wr_index * sizeof(packet_header_t); + noc_async_write_one_packet((uint32_t)(&client_interface->gk_message), noc_addr, sizeof(packet_header_t), noc_index); + noc_async_write_barrier(); +} - uint32_t start_timestamp = get_timestamp_32b(); - uint32_t sync_in_addr = ((uint32_t)&router_state->sync_in) / PACKET_WORD_SIZE_BYTES; - uint32_t sync_out_addr = ((uint32_t)&router_state->sync_out) / PACKET_WORD_SIZE_BYTES; +inline socket_handle_t* fabric_socket_open( + uint32_t routing_plane, // the network plane to use for this socket + uint16_t epoch_id, // Temporal epoch for which the socket is being opened + uint16_t socket_id, // Socket Id to open + uint8_t socket_type, // Unicast, Multicast, SSocket, DSocket + uint8_t direction, // Send or Receive + uint16_t remote_mesh_id, // Remote mesh/device that is the socket data sender/receiver. + uint16_t remote_dev_id, + uint8_t fvc // fabric virtual channel. 
+) { + uint32_t socket_count = client_interface->socket_count; + socket_handle_t* socket_handle = (socket_handle_t*)&client_interface->socket_handles[socket_count]; + socket_count++; + client_interface->socket_count = socket_count; + socket_handle->socket_state = SocketState::OPENING; - uint32_t scratch_addr = ((uint32_t)&router_state->scratch) / PACKET_WORD_SIZE_BYTES; - router_state->scratch[0] = 0xAA; - // send_buf[1] = 0x0; - // send_buf[2] = 0x0; - // send_buf[3] = 0x0; + if (direction == SOCKET_DIRECTION_SEND) { + client_interface->gk_message.packet_header.routing.dst_mesh_id = remote_mesh_id; + client_interface->gk_message.packet_header.routing.dst_dev_id = remote_dev_id; + } else { + client_interface->gk_message.packet_header.routing.src_mesh_id = remote_mesh_id; + client_interface->gk_message.packet_header.routing.src_dev_id = remote_dev_id; + } + client_interface->gk_message.packet_header.routing.flags = SYNC; + client_interface->gk_message.packet_header.session.command = SOCKET_OPEN; + client_interface->gk_message.packet_header.session.target_offset_h = client_interface->pull_req_buf_addr >> 32; + client_interface->gk_message.packet_header.session.target_offset_l = (uint32_t)client_interface->pull_req_buf_addr; + client_interface->gk_message.packet_header.session.ack_offset_h = xy_local_addr >> 32; + client_interface->gk_message.packet_header.session.ack_offset_l = (uint32_t)socket_handle; + client_interface->gk_message.packet_header.packet_parameters.socket_parameters.socket_id = socket_id; + client_interface->gk_message.packet_header.packet_parameters.socket_parameters.epoch_id = epoch_id; + client_interface->gk_message.packet_header.packet_parameters.socket_parameters.socket_type = socket_type; + client_interface->gk_message.packet_header.packet_parameters.socket_parameters.socket_direction = direction; + client_interface->gk_message.packet_header.packet_parameters.socket_parameters.routing_plane = routing_plane; + tt_fabric_add_header_checksum((packet_header_t*)&client_interface->gk_message.packet_header); + send_message_to_gk(); + return socket_handle; +} - while (!src_ready or !dest_ready) { - if (router_state->sync_out != 0xAA) { - internal_::eth_send_packet(0, scratch_addr, sync_in_addr, 1); - } else { - dest_ready = true; - } +inline void fabric_socket_close(socket_handle_t* socket_handle) { + packet_header_t* packet_header = (packet_header_t*)&client_interface->gk_message.packet_header; + uint32_t dst_mesh_id = socket_handle->rcvr_mesh_id; + uint32_t dst_dev_id = socket_handle->rcvr_dev_id; + packet_header->routing.flags = INLINE_FORWARD; + packet_header->routing.dst_mesh_id = dst_mesh_id; + packet_header->routing.dst_dev_id = dst_dev_id; + packet_header->routing.packet_size_bytes = PACKET_HEADER_SIZE_BYTES; + packet_header->session.command = SOCKET_CLOSE; + packet_header->session.target_offset_l = (uint32_t)socket_handle->pull_notification_adddr; + packet_header->session.target_offset_h = socket_handle->pull_notification_adddr >> 32; + tt_fabric_add_header_checksum(packet_header); - if (!src_ready && router_state->sync_in == 0xAA) { - internal_::eth_send_packet(0, sync_in_addr, sync_out_addr, 1); - src_ready = true; - } + uint32_t* dst = (uint32_t*)&client_interface->local_pull_request.pull_request; + uint32_t* src = (uint32_t*)packet_header; + for (uint32_t i = 0; i < sizeof(pull_request_t) / 4; i++) { + dst[i] = src[i]; + } + uint64_t dest_addr = + ((uint64_t)get_next_hop_router_noc_xy(socket_handle->routing_plane, dst_mesh_id, dst_dev_id) << 32) | + 
FABRIC_ROUTER_REQ_QUEUE_START; + tt_fabric_send_pull_request(dest_addr, (volatile local_pull_request_t*)&client_interface->local_pull_request); +} - iters++; - if (timeout_cycles > 0) { - uint32_t cycles_since_start = get_timestamp_32b() - start_timestamp; - if (cycles_since_start > timeout_cycles) { - return false; - } - } +inline void fabric_socket_connect(socket_handle_t* socket_handle) { + // wait for socket state to change to Active. + // Gatekeeper will update local socket handle when the receiver for send socket + // is ready. + while (((volatile socket_handle_t*)socket_handle)->socket_state != SocketState::ACTIVE); +} -#if defined(COMPILE_FOR_ERISC) - if ((timeout_cycles == 0) && (iters & 0xFFF) == 0) { - // if timeout is disabled, context switch every 4096 iterations. - // this is necessary to allow ethernet routing layer to operate. - // this core may have pending ethernet routing work. - internal_::risc_context_switch(); +inline void fabric_endpoint_init() { + uint64_t noc_addr = client_interface->gk_interface_addr + offsetof(gatekeeper_info_t, ep_sync); + client_interface->return_status[0] = 0; + while (1) { + noc_async_read_one_packet(noc_addr, (uint32_t)&client_interface->return_status[0], 4); + noc_async_read_barrier(); + if (client_interface->return_status[0] != 0) { + break; } -#endif } - return true; } diff --git a/tt_fabric/hw/inc/tt_fabric_interface.h b/tt_fabric/hw/inc/tt_fabric_interface.h index affefa22e20..f84c859dad3 100644 --- a/tt_fabric/hw/inc/tt_fabric_interface.h +++ b/tt_fabric/hw/inc/tt_fabric_interface.h @@ -16,8 +16,7 @@ static_assert(sizeof(tt_fabric_endpoint_sync_t) == 4); constexpr uint32_t PACKET_WORD_SIZE_BYTES = 16; constexpr uint32_t NUM_WR_CMD_BUFS = 4; -constexpr uint32_t DEFAULT_MAX_NOC_SEND_WORDS = - (NUM_WR_CMD_BUFS - 1) * (NOC_MAX_BURST_WORDS * NOC_WORD_BYTES) / PACKET_WORD_SIZE_BYTES; +constexpr uint32_t DEFAULT_MAX_NOC_SEND_WORDS = (NOC_MAX_BURST_WORDS * NOC_WORD_BYTES) / PACKET_WORD_SIZE_BYTES; constexpr uint32_t DEFAULT_MAX_ETH_SEND_WORDS = 2 * 1024; constexpr uint32_t FVC_SYNC_THRESHOLD = 256; @@ -30,6 +29,9 @@ enum SessionCommand : uint32_t { SSOCKET_WR = (0x1 << 5), ATOMIC_INC = (0x1 << 6), ATOMIC_READ_INC = (0x1 << 7), + SOCKET_OPEN = (0x1 << 8), + SOCKET_CLOSE = (0x1 << 9), + SOCKET_CONNECT = (0x1 << 10), }; #define INVALID 0x0 @@ -76,7 +78,13 @@ typedef struct mcast_params { } mcast_params; typedef struct socket_params { - uint32_t socket_id; + uint32_t padding1; + uint16_t socket_id; + uint16_t epoch_id; + uint8_t socket_type; + uint8_t socket_direction; + uint8_t routing_plane; + uint8_t padding; } socket_params; typedef struct atomic_params { @@ -117,6 +125,28 @@ const uint32_t PACKET_HEADER_SIZE_WORDS = PACKET_HEADER_SIZE_BYTES / PACKET_WORD static_assert(sizeof(packet_header) == PACKET_HEADER_SIZE_BYTES); +void tt_fabric_add_header_checksum(packet_header_t* p_header) { + uint16_t* ptr = (uint16_t*)p_header; + uint32_t sum = 0; + for (uint32_t i = 2; i < sizeof(packet_header_t) / 2; i++) { + sum += ptr[i]; + } + sum = ~sum; + sum += sum; + p_header->packet_parameters.misc_parameters.words[0] = sum; +} + +bool tt_fabric_is_header_valid(packet_header_t* p_header) { + uint16_t* ptr = (uint16_t*)p_header; + uint32_t sum = 0; + for (uint32_t i = 2; i < sizeof(packet_header_t) / 2; i++) { + sum += ptr[i]; + } + sum = ~sum; + sum += sum; + return (p_header->packet_parameters.misc_parameters.words[0] == sum); +} + // This is a pull request entry for a fabric router. 
// Pull request issuer populates these entries to identify // the data that fabric router needs to pull from requestor. @@ -188,11 +218,127 @@ typedef struct chan_payload_ptr { static_assert(sizeof(chan_payload_ptr) == CHAN_PTR_SIZE_BYTES); +// Fabric Virtual Control Channel (FVCC) parameters. +// Each control channel message is 48 Bytes. +// FVCC buffer is a 16 message buffer each for incoming and outgoing messages. +// Control message capacity can be increased by increasing FVCC_BUF_SIZE. +const uint32_t FVCC_BUF_SIZE = 16; // must be 2^N +const uint32_t FVCC_BUF_LOG_SIZE = 4; // must be log2(FVCC_BUF_SIZE) +const uint32_t FVCC_SIZE_MASK = (FVCC_BUF_SIZE - 1); +const uint32_t FVCC_PTR_MASK = ((FVCC_BUF_SIZE << 1) - 1); +const uint32_t FVCC_BUF_SIZE_BYTES = PULL_REQ_SIZE_BYTES * FVCC_BUF_SIZE + 2 * CHAN_PTR_SIZE_BYTES; +const uint32_t FVCC_SYNC_BUF_SIZE_BYTES = CHAN_PTR_SIZE_BYTES * FVCC_BUF_SIZE; + +inline bool fvcc_buf_ptrs_empty(uint32_t wrptr, uint32_t rdptr) { return (wrptr == rdptr); } + +inline bool fvcc_buf_ptrs_full(uint32_t wrptr, uint32_t rdptr) { + uint32_t distance = wrptr >= rdptr ? wrptr - rdptr : wrptr + 2 * FVCC_BUF_SIZE - rdptr; + return !fvcc_buf_ptrs_empty(wrptr, rdptr) && (distance >= FVCC_BUF_SIZE); +} + +// out_req_buf has 16 byte additional storage per entry to hold the outgoing +// write pointer update. this is sent over ethernet. +// For incoming requests over ethernet, we only need storate for the request +// entry. The pointer update goes to fvcc state. +typedef struct ctrl_chan_msg_buf { + chan_ptr wrptr; + chan_ptr rdptr; + chan_request_entry_t msg_buf[FVCC_BUF_SIZE]; +} ctrl_chan_msg_buf; + +typedef struct ctrl_chan_sync_buf { + chan_payload_ptr ptr[FVCC_BUF_SIZE]; +} ctrl_chan_sync_buf; + +static_assert(sizeof(ctrl_chan_msg_buf) == FVCC_BUF_SIZE_BYTES); + +typedef struct sync_word { + uint32_t val; + uint32_t padding[3]; +} sync_word_t; + +typedef struct gatekeeper_info { + sync_word_t router_sync; + sync_word_t ep_sync; + uint32_t routing_planes; + uint32_t padding[3]; + ctrl_chan_msg_buf gk_msg_buf; +} gatekeeper_info_t; + +#define SOCKET_DIRECTION_SEND 1 +#define SOCKET_DIRECTION_RECV 2 +#define SOCKET_TYPE_DGRAM 1 +#define SOCKET_TYPE_STREAM 2 + +enum SocketState : uint8_t { + IDLE = 0, + OPENING = 1, + ACTIVE = 2, + CLOSING = 3, +}; + +typedef struct socket_handle { + uint16_t socket_id; + uint16_t epoch_id; + uint8_t socket_state; + uint8_t socket_type; + uint8_t socket_direction; + uint8_t rcvrs_ready; + uint32_t routing_plane; + uint16_t sender_mesh_id; + uint16_t sender_dev_id; + uint16_t rcvr_mesh_id; + uint16_t rcvr_dev_id; + uint32_t sender_handle; + uint64_t pull_notification_adddr; + uint64_t status_notification_addr; + uint32_t padding[2]; +} socket_handle_t; + +static_assert(sizeof(socket_handle_t) % 16 == 0); + +constexpr uint32_t MAX_SOCKETS = 64; +typedef struct socket_info { + uint32_t socket_count; + uint32_t socket_setup_pending; + uint32_t padding[2]; + socket_handle_t sockets[MAX_SOCKETS]; + chan_ptr wrptr; + chan_ptr rdptr; + chan_request_entry_t gk_message; +} socket_info_t; +static_assert(sizeof(socket_info_t) % 16 == 0); + +typedef struct tt_fabric_client_interface { + uint64_t gk_interface_addr; + uint64_t gk_msg_buf_addr; + uint64_t pull_req_buf_addr; + uint32_t padding[2]; + uint32_t return_status[3]; + uint32_t socket_count; + chan_ptr wrptr; + chan_ptr rdptr; + chan_request_entry_t gk_message; + local_pull_request_t local_pull_request; + socket_handle_t socket_handles[MAX_SOCKETS]; +} tt_fabric_client_interface_t; + 
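+// A single tt_fabric_client_interface_t lives in each fabric client's L1: gk_interface_addr and
+// gk_msg_buf_addr locate the gatekeeper's info block and message buffer (used by send_message_to_gk
+// and fabric_endpoint_init in tt_fabric_api.h), while socket_handles caches the sockets opened by
+// this endpoint.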
+static_assert(sizeof(tt_fabric_client_interface_t) % 16 == 0); + constexpr uint32_t FABRIC_ROUTER_MISC_START = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; constexpr uint32_t FABRIC_ROUTER_MISC_SIZE = 256; constexpr uint32_t FABRIC_ROUTER_SYNC_SEM = FABRIC_ROUTER_MISC_START; constexpr uint32_t FABRIC_ROUTER_SYNC_SEM_SIZE = 16; -constexpr uint32_t FABRIC_ROUTER_REQ_QUEUE_START = FABRIC_ROUTER_MISC_START + FABRIC_ROUTER_MISC_SIZE; +// Fabric Virtual Control Channel start/size +constexpr uint32_t FVCC_OUT_BUF_START = FABRIC_ROUTER_MISC_START + FABRIC_ROUTER_MISC_SIZE; +constexpr uint32_t FVCC_OUT_BUF_SIZE = FVCC_BUF_SIZE_BYTES; +constexpr uint32_t FVCC_SYNC_BUF_START = FVCC_OUT_BUF_START + FVCC_OUT_BUF_SIZE; +constexpr uint32_t FVCC_SYNC_BUF_SIZE = FVCC_SYNC_BUF_SIZE_BYTES; +constexpr uint32_t FVCC_IN_BUF_START = FVCC_SYNC_BUF_START + FVCC_SYNC_BUF_SIZE; +constexpr uint32_t FVCC_IN_BUF_SIZE = FVCC_BUF_SIZE_BYTES; + +// Fabric Virtual Channel start/size +constexpr uint32_t FABRIC_ROUTER_REQ_QUEUE_START = FVCC_IN_BUF_START + FVCC_IN_BUF_SIZE; constexpr uint32_t FABRIC_ROUTER_REQ_QUEUE_SIZE = sizeof(chan_req_buf); constexpr uint32_t FABRIC_ROUTER_DATA_BUF_START = FABRIC_ROUTER_REQ_QUEUE_START + FABRIC_ROUTER_REQ_QUEUE_SIZE; diff --git a/tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp b/tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp new file mode 100644 index 00000000000..7a4cc07e258 --- /dev/null +++ b/tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp @@ -0,0 +1,506 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// clang-format off +#include "tt_metal/hw/inc/dataflow_api.h" +#include "tt_fabric/hw/inc/tt_fabric.h" +#include "debug/dprint.h" +// clang-format on + +constexpr uint32_t gatekeeper_info_addr = get_compile_time_arg_val(0); +constexpr uint32_t socket_info_addr = get_compile_time_arg_val(1); +constexpr uint32_t routing_table_addr = get_compile_time_arg_val(2); +constexpr uint32_t kernel_status_buf_addr = get_compile_time_arg_val(3); +constexpr uint32_t kernel_status_buf_size_bytes = get_compile_time_arg_val(4); +constexpr uint32_t timeout_cycles = get_compile_time_arg_val(5); +uint32_t sync_val; +uint32_t router_mask; + +constexpr uint32_t PACKET_QUEUE_STAUS_MASK = 0xabc00000; +constexpr uint32_t PACKET_QUEUE_TEST_STARTED = PACKET_QUEUE_STAUS_MASK | 0x0; +constexpr uint32_t PACKET_QUEUE_TEST_PASS = PACKET_QUEUE_STAUS_MASK | 0x1; +constexpr uint32_t PACKET_QUEUE_TEST_TIMEOUT = PACKET_QUEUE_STAUS_MASK | 0xdead0; +constexpr uint32_t PACKET_QUEUE_TEST_BAD_HEADER = PACKET_QUEUE_STAUS_MASK | 0xdead1; +constexpr uint32_t PACKET_QUEUE_TEST_DATA_MISMATCH = PACKET_QUEUE_STAUS_MASK | 0x3; + +// indexes of return values in test results buffer +constexpr uint32_t PQ_TEST_STATUS_INDEX = 0; +constexpr uint32_t PQ_TEST_WORD_CNT_INDEX = 2; +constexpr uint32_t PQ_TEST_CYCLES_INDEX = 4; +constexpr uint32_t PQ_TEST_ITER_INDEX = 6; +constexpr uint32_t PQ_TEST_MISC_INDEX = 16; + +// careful, may be null +tt_l1_ptr uint32_t* const kernel_status = reinterpret_cast(kernel_status_buf_addr); +volatile tt_l1_ptr tt::tt_fabric::fabric_router_l1_config_t* routing_table = + reinterpret_cast(routing_table_addr); + +volatile gatekeeper_info_t* gk_info = (volatile gatekeeper_info_t*)gatekeeper_info_addr; +volatile socket_info_t* socket_info = (volatile socket_info_t*)socket_info_addr; + +uint64_t xy_local_addr; +uint64_t router_addr; +uint32_t gk_message_pending; + +inline void notify_all_routers(uint32_t notification) { + uint32_t remaining_cores = 
router_mask;
+    for (uint32_t i = 0; i < 16; i++) {
+        if (remaining_cores == 0) {
+            break;
+        }
+        if (remaining_cores & (0x1 << i)) {
+            uint64_t dest_addr = ((uint64_t)eth_chan_to_noc_xy[noc_index][i] << 32) | FABRIC_ROUTER_SYNC_SEM;
+            noc_inline_dw_write(dest_addr, notification);
+            remaining_cores &= ~(0x1 << i);
+        }
+    }
+}
+
+inline void sync_all_routers() {
+    // wait for all device routers to have incremented the sync semaphore.
+    // sync_val is equal to number of tt-fabric routers running on a device.
+    while (gk_info->router_sync.val != sync_val);
+
+    // write the sync value to every fabric router's sync semaphore on this device.
+    // this notifies the routers that all of them have completed the startup
+    // handshake with their ethernet peers.
+    notify_all_routers(sync_val);
+    gk_info->ep_sync.val = sync_val;
+}
+
+inline void gk_msg_buf_ptr_advance(chan_ptr* ptr) { ptr->ptr = (ptr->ptr + 1) & FVCC_PTR_MASK; }
+
+inline void gk_msg_buf_advance_wrptr(ctrl_chan_msg_buf* msg_buf) { gk_msg_buf_ptr_advance(&(msg_buf->wrptr)); }
+
+inline void gk_msg_buf_advance_rdptr(ctrl_chan_msg_buf* msg_buf) {
+    // clear valid before incrementing read pointer.
+    uint32_t rd_index = msg_buf->rdptr.ptr & FVCC_SIZE_MASK;
+    msg_buf->msg_buf[rd_index].bytes[47] = 0;
+    gk_msg_buf_ptr_advance(&(msg_buf->rdptr));
+}
+
+inline bool gk_msg_buf_ptrs_empty(uint32_t wrptr, uint32_t rdptr) { return (wrptr == rdptr); }
+
+inline bool gk_msg_buf_ptrs_full(uint32_t wrptr, uint32_t rdptr) {
+    uint32_t distance = wrptr >= rdptr ? wrptr - rdptr : wrptr + 2 * FVCC_BUF_SIZE - rdptr;
+    return !gk_msg_buf_ptrs_empty(wrptr, rdptr) && (distance >= FVCC_BUF_SIZE);
+}
+
+inline bool gk_msg_buf_is_empty(const volatile ctrl_chan_msg_buf* msg_buf) {
+    return gk_msg_buf_ptrs_empty(msg_buf->wrptr.ptr, msg_buf->rdptr.ptr);
+}
+
+inline bool gk_msg_buf_is_full(const volatile ctrl_chan_msg_buf* msg_buf) {
+    return gk_msg_buf_ptrs_full(msg_buf->wrptr.ptr, msg_buf->rdptr.ptr);
+}
+
+inline bool gk_msg_valid(const volatile ctrl_chan_msg_buf* msg_buf) {
+    uint32_t rd_index = msg_buf->rdptr.ptr & FVCC_SIZE_MASK;
+    return msg_buf->msg_buf[rd_index].packet_header.routing.flags != 0;
+}
+
+// Check whether the remote receiver has already registered a connect for this
+// socket (recorded locally as an OPENING, send-direction handle by
+// socket_open_for_connect). Returns the matching handle, or 0 if none exists.
+inline socket_handle_t* socket_receiver_available(packet_header_t* packet) {
+    socket_handle_t* handle = 0;
+    bool sender_found = false;
+    for (uint32_t i = 0; i < MAX_SOCKETS; i++) {
+        if (socket_info->sockets[i].socket_state == SocketState::OPENING &&
+            socket_info->sockets[i].socket_direction == SOCKET_DIRECTION_SEND) {
+            handle = (socket_handle_t*)&socket_info->sockets[i];
+            if (handle->socket_id != packet->packet_parameters.socket_parameters.socket_id) {
+                continue;
+            }
+            if (handle->epoch_id != packet->packet_parameters.socket_parameters.epoch_id) {
+                continue;
+            }
+            if (handle->sender_dev_id != routing_table[0].my_device_id) {
+                continue;
+            }
+            if (handle->sender_mesh_id != routing_table[0].my_mesh_id) {
+                continue;
+            }
+            if (handle->rcvr_dev_id != packet->routing.dst_dev_id) {
+                continue;
+            }
+            if (handle->rcvr_mesh_id != packet->routing.dst_mesh_id) {
+                continue;
+            }
+            sender_found = true;
+            break;
+        }
+    }
+    return sender_found ? handle : 0;
+}
+
+inline void set_socket_active(socket_handle_t* handle) {
+    handle->socket_state = SocketState::ACTIVE;
+    // send socket connection state to the send-socket opener.
+ noc_async_write_one_packet( + (uint32_t)(handle), handle->status_notification_addr, sizeof(socket_handle_t), noc_index); +} + +inline void socket_open(packet_header_t* packet) { + socket_handle_t* handle = 0; + if (packet->packet_parameters.socket_parameters.socket_direction == SOCKET_DIRECTION_SEND) { + handle = socket_receiver_available(packet); + if (handle) { + // If remote receive socket already opened, + // set send socket state to active and return. + handle->status_notification_addr = + ((uint64_t)packet->session.ack_offset_h << 32) | packet->session.ack_offset_l; + set_socket_active(handle); + DPRINT << "GK: Receiver Available " << (uint32_t)handle->socket_id << ENDL(); + return; + } + } + + for (uint32_t i = 0; i < MAX_SOCKETS; i++) { + if (socket_info->sockets[i].socket_state == 0) { + // idle socket handle. + socket_handle_t* handle = (socket_handle_t*)&socket_info->sockets[i]; + handle->socket_id = packet->packet_parameters.socket_parameters.socket_id; + handle->epoch_id = packet->packet_parameters.socket_parameters.epoch_id; + handle->socket_type = packet->packet_parameters.socket_parameters.socket_type; + handle->socket_direction = packet->packet_parameters.socket_parameters.socket_direction; + handle->routing_plane = packet->packet_parameters.socket_parameters.routing_plane; + handle->status_notification_addr = + ((uint64_t)packet->session.ack_offset_h << 32) | packet->session.ack_offset_l; + handle->socket_state = SocketState::OPENING; + + if (handle->socket_direction == SOCKET_DIRECTION_RECV) { + handle->pull_notification_adddr = + ((uint64_t)packet->session.target_offset_h << 32) | packet->session.target_offset_l; + handle->sender_dev_id = packet->routing.src_dev_id; + handle->sender_mesh_id = packet->routing.src_mesh_id; + handle->rcvr_dev_id = routing_table[0].my_device_id; + handle->rcvr_mesh_id = routing_table[0].my_mesh_id; + socket_info->socket_setup_pending++; + } else { + handle->sender_dev_id = routing_table[0].my_device_id; + handle->sender_mesh_id = routing_table[0].my_mesh_id; + handle->rcvr_dev_id = packet->routing.dst_dev_id; + handle->rcvr_mesh_id = packet->routing.dst_mesh_id; + } + DPRINT << "GK: Socket Opened : " << (uint32_t)handle->socket_id << ENDL(); + break; + } + } +} + +inline void socket_close(packet_header_t* packet) { + for (uint32_t i = 0; i < MAX_SOCKETS; i++) { + bool found = socket_info->sockets[i].socket_id == packet->packet_parameters.socket_parameters.socket_id && + socket_info->sockets[i].epoch_id == packet->packet_parameters.socket_parameters.epoch_id; + if (found) { + socket_handle_t* handle = (socket_handle_t*)&socket_info->sockets[i]; + // handle->socket_id = 0; + // handle->epoch_id = 0; + // handle->socket_type = 0; + // handle->socket_direction = 0; + handle->pull_notification_adddr = 0; + handle->socket_state = SocketState::CLOSING; + socket_info->socket_setup_pending++; + break; + } + } +} + +inline void socket_open_for_connect(packet_header_t* packet) { + // open a send socket in response to a connect message received from a + // socket receiver. + // Local device has not yet opened a send socket, but we need to update + // local socket state. + for (uint32_t i = 0; i < MAX_SOCKETS; i++) { + if (socket_info->sockets[i].socket_state == 0) { + // idle socket handle. 
+            socket_handle_t* handle = (socket_handle_t*)&socket_info->sockets[i];
+            handle->socket_id = packet->packet_parameters.socket_parameters.socket_id;
+            handle->epoch_id = packet->packet_parameters.socket_parameters.epoch_id;
+            handle->socket_type = packet->packet_parameters.socket_parameters.socket_type;
+            handle->socket_direction = SOCKET_DIRECTION_SEND;
+            handle->pull_notification_adddr =
+                ((uint64_t)packet->session.target_offset_h << 32) | packet->session.target_offset_l;
+            handle->routing_plane = packet->packet_parameters.socket_parameters.routing_plane;
+            handle->socket_state = SocketState::OPENING;
+            handle->sender_dev_id = routing_table[0].my_device_id;
+            handle->sender_mesh_id = routing_table[0].my_mesh_id;
+            handle->rcvr_dev_id = packet->routing.src_dev_id;
+            handle->rcvr_mesh_id = packet->routing.src_mesh_id;
+            DPRINT << "GK: Opened Send Socket for Remote " << (uint32_t)handle->socket_id << ENDL();
+
+            break;
+        }
+    }
+}
+
+inline void socket_connect(packet_header_t* packet) {
+    // connect message is initiated by a receiving socket.
+    // find a matching send socket in local state.
+    bool found_sender = false;
+    for (uint32_t i = 0; i < MAX_SOCKETS; i++) {
+        if (socket_info->sockets[i].socket_state == SocketState::OPENING &&
+            socket_info->sockets[i].socket_direction == SOCKET_DIRECTION_SEND) {
+            socket_handle_t* handle = (socket_handle_t*)&socket_info->sockets[i];
+            if (handle->socket_id != packet->packet_parameters.socket_parameters.socket_id) {
+                continue;
+            }
+            if (handle->epoch_id != packet->packet_parameters.socket_parameters.epoch_id) {
+                continue;
+            }
+            if (handle->sender_dev_id != packet->routing.dst_dev_id) {
+                continue;
+            }
+            if (handle->sender_mesh_id != packet->routing.dst_mesh_id) {
+                continue;
+            }
+            if (handle->rcvr_dev_id != packet->routing.src_dev_id) {
+                continue;
+            }
+            if (handle->rcvr_mesh_id != packet->routing.src_mesh_id) {
+                continue;
+            }
+            found_sender = true;
+            handle->pull_notification_adddr =
+                ((uint64_t)packet->session.target_offset_h << 32) | packet->session.target_offset_l;
+            handle->socket_state = SocketState::ACTIVE;
+            set_socket_active(handle);
+            DPRINT << "GK: Found Send Socket " << (uint32_t)handle->socket_id << ENDL();
+            break;
+        }
+    }
+
+    if (!found_sender) {
+        // No send socket opened yet.
+        // Device has not made it to the epoch where a send socket is opened.
+        // Log the receive socket connect request.
+ socket_open_for_connect(packet); + } +} + +uint32_t get_next_hop_router_noc_xy(packet_header_t* current_packet_header, uint32_t routing_plane) { + uint32_t dst_mesh_id = current_packet_header->routing.dst_mesh_id; + if (dst_mesh_id != routing_table->my_mesh_id) { + uint32_t next_port = routing_table[routing_plane].inter_mesh_table.dest_entry[dst_mesh_id]; + return eth_chan_to_noc_xy[noc_index][next_port]; + } else { + uint32_t dst_device_id = current_packet_header->routing.dst_dev_id; + uint32_t next_port = routing_table[routing_plane].intra_mesh_table.dest_entry[dst_device_id]; + return eth_chan_to_noc_xy[noc_index][next_port]; + } +} + +inline bool send_gk_message(uint64_t dest_addr, packet_header_t* packet) { + uint64_t noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, wrptr); + noc_fast_atomic_increment( + noc_index, + NCRISC_AT_CMD_BUF, + noc_addr, + NOC_UNICAST_WRITE_VC, + 1, + FVCC_BUF_LOG_SIZE, + false, + false, + (uint32_t)&socket_info->wrptr.ptr); + while (!ncrisc_noc_nonposted_atomics_flushed(noc_index)); + + uint32_t wrptr = socket_info->wrptr.ptr; + noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, rdptr); + + noc_async_read_one_packet(noc_addr, (uint32_t)(&socket_info->rdptr.ptr), 4); + noc_async_read_barrier(); + if (fvcc_buf_ptrs_full(wrptr, socket_info->rdptr.ptr)) { + return false; + } + + uint32_t dest_wr_index = wrptr & FVCC_SIZE_MASK; + noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, msg_buf) + dest_wr_index * sizeof(packet_header_t); + noc_async_write_one_packet((uint32_t)(packet), noc_addr, sizeof(packet_header_t), noc_index); + return true; +} + +inline bool retry_gk_message(uint64_t dest_addr, packet_header_t* packet) { + uint32_t wrptr = socket_info->wrptr.ptr; + uint64_t noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, rdptr); + + noc_async_read_one_packet(noc_addr, (uint32_t)(&socket_info->rdptr.ptr), 4); + noc_async_read_barrier(); + if (fvcc_buf_ptrs_full(wrptr, socket_info->rdptr.ptr)) { + return false; + } + + uint32_t dest_wr_index = wrptr & FVCC_SIZE_MASK; + noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, msg_buf) + dest_wr_index * sizeof(packet_header_t); + noc_async_write_one_packet((uint32_t)(packet), noc_addr, sizeof(packet_header_t), noc_index); + gk_message_pending = 0; + return true; +} + +inline void process_pending_socket() { + if (socket_info->socket_setup_pending == 0) { + // there is no pending socket setup work. + // or there is a pending message to be sent out + return; + } + + // Check write flush of previous gk message write. + // TODO. + + chan_request_entry_t* message = (chan_request_entry_t*)&socket_info->gk_message; + + if (gk_message_pending == 1) { + if (retry_gk_message(router_addr, &message->packet_header)) { + // decrement pending count. + // if there is more pending socket work, will pick up on next iteration. + socket_info->socket_setup_pending--; + } + } else { + for (uint32_t i = 0; i < MAX_SOCKETS; i++) { + if (socket_info->sockets[i].socket_state == SocketState::OPENING && + socket_info->sockets[i].socket_direction == SOCKET_DIRECTION_RECV) { + // send a message on fvcc to socket data sender. 
+ socket_handle_t* handle = (socket_handle_t*)&socket_info->sockets[i]; + message->packet_header.routing.dst_dev_id = handle->sender_dev_id; + message->packet_header.routing.dst_mesh_id = handle->sender_mesh_id; + message->packet_header.routing.src_dev_id = handle->rcvr_dev_id; + message->packet_header.routing.src_mesh_id = handle->rcvr_mesh_id; + message->packet_header.routing.packet_size_bytes = PACKET_HEADER_SIZE_BYTES; + message->packet_header.routing.flags = SYNC; + message->packet_header.session.command = SOCKET_CONNECT; + message->packet_header.session.target_offset_h = handle->pull_notification_adddr >> 32; + message->packet_header.session.target_offset_l = (uint32_t)handle->pull_notification_adddr; + message->packet_header.packet_parameters.socket_parameters.socket_id = handle->socket_id; + message->packet_header.packet_parameters.socket_parameters.epoch_id = handle->epoch_id; + message->packet_header.packet_parameters.socket_parameters.socket_type = handle->socket_type; + message->packet_header.packet_parameters.socket_parameters.socket_direction = SOCKET_DIRECTION_RECV; + message->packet_header.packet_parameters.socket_parameters.routing_plane = handle->routing_plane; + tt_fabric_add_header_checksum((packet_header_t*)&message->packet_header); + + router_addr = + ((uint64_t)get_next_hop_router_noc_xy(&message->packet_header, handle->routing_plane) << 32) | + FVCC_OUT_BUF_START; + + if (send_gk_message(router_addr, &message->packet_header)) { + DPRINT << "GK: Sending Connect to " << (uint32_t)handle->sender_dev_id << ENDL(); + handle->socket_state = SocketState::ACTIVE; + socket_info->socket_setup_pending--; + } else { + DPRINT << "GK: Pending Connect to " << (uint32_t)handle->sender_dev_id << ENDL(); + // gatekeeper message buffer is full. need to retry on next iteration. 
+ gk_message_pending = 1; + } + } + } + } +} + +inline void get_routing_tables() { + uint32_t temp_mask = router_mask; + uint32_t channel = 0; + uint32_t routing_plane = 0; + for (uint32_t i = 0; i < 4; i++) { + if (temp_mask & 0xF) { + temp_mask &= 0xF; + break; + } else { + temp_mask >>= 4; + } + channel += 4; + } + + if (temp_mask) { + for (uint32_t i = 0; i < 4; i++) { + if (temp_mask & 0x1) { + uint64_t router_config_addr = ((uint64_t)eth_chan_to_noc_xy[noc_index][channel] << 32) | + eth_l1_mem::address_map::FABRIC_ROUTER_CONFIG_BASE; + noc_async_read_one_packet( + router_config_addr, + (uint32_t)&routing_table[routing_plane], + sizeof(tt::tt_fabric::fabric_router_l1_config_t)); + routing_plane++; + } + temp_mask >>= 1; + channel++; + } + } + gk_info->routing_planes = routing_plane; + noc_async_read_barrier(); +} + +void kernel_main() { + sync_val = get_arg_val(0); + router_mask = get_arg_val(1); + gk_message_pending = 0; + router_addr = 0; + + tt_fabric_init(); + + write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_STARTED); + write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000000); + write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX + 1, 0xbb000000); + write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX + 2, 0xAABBCCDD); + write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX + 3, 0xDDCCBBAA); + + zero_l1_buf((tt_l1_ptr uint32_t*)&gk_info->gk_msg_buf, FVCC_BUF_SIZE_BYTES); + zero_l1_buf((tt_l1_ptr uint32_t*)socket_info, sizeof(socket_info_t)); + + sync_all_routers(); + get_routing_tables(); + uint64_t start_timestamp = get_timestamp(); + + uint32_t loop_count = 0; + uint32_t total_messages_procesed = 0; + volatile ctrl_chan_msg_buf* msg_buf = &gk_info->gk_msg_buf; + while (1) { + if (!gk_msg_buf_is_empty(msg_buf) && gk_msg_valid(msg_buf)) { + uint32_t msg_index = msg_buf->rdptr.ptr & FVCC_SIZE_MASK; + chan_request_entry_t* req = (chan_request_entry_t*)msg_buf->msg_buf + msg_index; + packet_header_t* packet = &req->packet_header; + if (tt_fabric_is_header_valid(packet)) { + total_messages_procesed++; + DPRINT << "GK: Message Received " << (uint32_t)packet->session.command + << " msg num = " << total_messages_procesed << ENDL(); + if (packet->routing.flags == SYNC) { + if (packet->session.command == SOCKET_OPEN) { + DPRINT << "GK: Socket Open " << ENDL(); + socket_open(packet); + } else if (packet->session.command == SOCKET_CLOSE) { + socket_close(packet); + } else if (packet->session.command == SOCKET_CONNECT) { + DPRINT << "GK: Socket Connect " << ENDL(); + socket_connect(packet); + } + } + + // clear the flags field to invalidate pull request slot. + // flags will be set to non-zero by next requestor. + gk_msg_buf_advance_rdptr((ctrl_chan_msg_buf*)msg_buf); + loop_count = 0; + } else { + write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_BAD_HEADER); + return; + } + } + + loop_count++; + + process_pending_socket(); + + if (gk_info->router_sync.val == 0) { + // terminate signal from host sw. + if (loop_count >= 0x1000) { + // send terminate to all chip routers. 
+ notify_all_routers(0); + break; + } + } + } + + DPRINT << "Gatekeeper messages processed " << total_messages_procesed << ENDL(); + + write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000002); + + write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000003); + + write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_PASS); + + write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff00005); +} diff --git a/tt_fabric/impl/kernels/tt_fabric_router.cpp b/tt_fabric/impl/kernels/tt_fabric_router.cpp index 847002383f2..c7f6e0e34f3 100644 --- a/tt_fabric/impl/kernels/tt_fabric_router.cpp +++ b/tt_fabric/impl/kernels/tt_fabric_router.cpp @@ -10,16 +10,18 @@ router_state_t router_state __attribute__((aligned(16))); fvc_consumer_state_t fvc_consumer_state __attribute__((aligned(16))); // replicate for each fvc fvc_producer_state_t fvc_producer_state __attribute__((aligned(16))); // replicate for each fvc +fvcc_inbound_state_t fvcc_inbound_state __attribute__((aligned(16))); // inbound fabric virtual control channel +fvcc_outbound_state_t fvcc_outbound_state __attribute__((aligned(16))); // outbound fabric virtual control channel volatile local_pull_request_t local_pull_request_temp __attribute__((aligned(16))); // replicate for each fvc volatile local_pull_request_t* local_pull_request = &local_pull_request_temp; // replicate for each fvc constexpr uint32_t fvc_data_buf_size_words = get_compile_time_arg_val(0); constexpr uint32_t fvc_data_buf_size_bytes = fvc_data_buf_size_words * PACKET_WORD_SIZE_BYTES; -constexpr uint32_t kernel_status_buf_addr_arg = get_compile_time_arg_val(1); -constexpr uint32_t kernel_status_buf_size_bytes = get_compile_time_arg_val(2); -// constexpr uint32_t sync_val = get_compile_time_arg_val(3); -// constexpr uint32_t router_mask = get_compile_time_arg_val(4); -constexpr uint32_t timeout_cycles = get_compile_time_arg_val(3); +constexpr uint32_t gk_message_addr_l = get_compile_time_arg_val(1); +constexpr uint32_t gk_message_addr_h = get_compile_time_arg_val(2); +constexpr uint32_t kernel_status_buf_addr_arg = get_compile_time_arg_val(3); +constexpr uint32_t kernel_status_buf_size_bytes = get_compile_time_arg_val(4); +constexpr uint32_t timeout_cycles = get_compile_time_arg_val(5); uint32_t sync_val; uint32_t router_mask; @@ -48,30 +50,23 @@ uint64_t xy_local_addr; #define SWITCH_THRESHOLD 0x3FF -inline void notify_all_routers() { - uint32_t channel = 0; - uint32_t remaining_cores = router_mask; - // send semaphore increment to all fabric routers on this device. +inline void notify_gatekeeper() { + // send semaphore increment to gatekeeper on this device. // semaphore notifies all other routers that this router has completed // startup handshake with its ethernet peer. 
- for (uint32_t i = 0; i < 16; i++) { - if (remaining_cores == 0) { - break; - } - if (remaining_cores & (0x1 << i)) { - uint64_t dest_addr = ((uint64_t)eth_chan_to_noc_xy[noc_index][i] << 32) | FABRIC_ROUTER_SYNC_SEM; - noc_fast_atomic_increment( - noc_index, - NCRISC_AT_CMD_BUF, - dest_addr, - NOC_UNICAST_WRITE_VC, - 1, - 31, - false, - false, - MEM_NOC_ATOMIC_RET_VAL_ADDR); - } - } + uint64_t dest_addr = + (((uint64_t)gk_message_addr_h << 32) | gk_message_addr_l) + offsetof(gatekeeper_info_t, router_sync); + noc_fast_atomic_increment( + noc_index, + NCRISC_AT_CMD_BUF, + dest_addr, + NOC_UNICAST_WRITE_VC, + 1, + 31, + false, + false, + MEM_NOC_ATOMIC_RET_VAL_ADDR); + volatile uint32_t* sync_sem_addr = (volatile uint32_t*)FABRIC_ROUTER_SYNC_SEM; // wait for all device routers to have incremented the sync semaphore. // sync_val is equal to number of tt-fabric routers running on a device. @@ -99,6 +94,8 @@ void kernel_main() { router_state.sync_out = 0; zero_l1_buf((tt_l1_ptr uint32_t*)fvc_consumer_req_buf, sizeof(chan_req_buf)); + zero_l1_buf((tt_l1_ptr uint32_t*)FVCC_IN_BUF_START, FVCC_IN_BUF_SIZE); + zero_l1_buf((tt_l1_ptr uint32_t*)FVCC_OUT_BUF_START, FVCC_OUT_BUF_SIZE); write_kernel_status(kernel_status, PQ_TEST_WORD_CNT_INDEX, (uint32_t)&router_state); write_kernel_status(kernel_status, PQ_TEST_WORD_CNT_INDEX + 1, (uint32_t)&fvc_consumer_state); write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX + 1, (uint32_t)&fvc_producer_state); @@ -110,12 +107,19 @@ void kernel_main() { fvc_data_buf_size_words / 2, (uint32_t)&fvc_consumer_state.remote_rdptr); + fvcc_outbound_state.init( + FVCC_OUT_BUF_START, FVCC_SYNC_BUF_START, FVCC_IN_BUF_START, (uint32_t)&fvcc_inbound_state.inbound_wrptr); + fvcc_inbound_state.init( + FVCC_IN_BUF_START, + (uint32_t)&fvcc_outbound_state.remote_rdptr, + (((uint64_t)gk_message_addr_h << 32) | gk_message_addr_l) + offsetof(gatekeeper_info_t, gk_msg_buf)); + if (!wait_all_src_dest_ready(&router_state, timeout_cycles)) { write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_TIMEOUT); return; } - notify_all_routers(); + notify_gatekeeper(); uint64_t start_timestamp = get_timestamp(); write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000001); @@ -195,6 +199,9 @@ void kernel_main() { return; } + fvcc_inbound_state.fvcc_handler(); + fvcc_outbound_state.fvcc_handler(); + loop_count++; // need to optimize this.