diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
index 31e3648d336..972bc721995 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
@@ -18,7 +18,6 @@ set(PERF_MICROBENCH_TESTS_SRCS
     routing/test_vc_loopback_tunnel.cpp
     routing/test_tt_fabric_sanity.cpp
     routing/test_tt_fabric_multi_hop_sanity.cpp
-    routing/test_tt_fabric_socket_sanity.cpp
     noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp
     old/matmul/matmul_global_l1.cpp
     old/matmul/matmul_local_l1.cpp
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp
deleted file mode 100644
index ea42c57b436..00000000000
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp
+++ /dev/null
@@ -1,152 +0,0 @@
-// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-// clang-format off
-#include "debug/dprint.h"
-#include "dataflow_api.h"
-#include "tt_fabric/hw/inc/tt_fabric.h"
-#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp"
-#include "tt_fabric/hw/inc/tt_fabric_interface.h"
-#include "tt_fabric/hw/inc/tt_fabric_api.h"
-// clang-format on
-
-// seed to re-generate the data and validate against incoming data
-constexpr uint32_t prng_seed = get_compile_time_arg_val(0);
-
-// total data/payload expected
-constexpr uint32_t total_data_kb = get_compile_time_arg_val(1);
-constexpr uint64_t total_data_words = ((uint64_t)total_data_kb) * 1024 / PACKET_WORD_SIZE_BYTES;
-
-// max packet size to generate mask
-constexpr uint32_t max_packet_size_words = get_compile_time_arg_val(2);
-static_assert(max_packet_size_words > 3, "max_packet_size_words must be greater than 3");
-
-// fabric command
-constexpr uint32_t test_command = get_compile_time_arg_val(3);
-
-// address to start reading from/poll on
-constexpr uint32_t target_address = get_compile_time_arg_val(4);
-
-// atomic increment for the ATOMIC_INC command
-constexpr uint32_t atomic_increment = get_compile_time_arg_val(5);
-
-constexpr uint32_t test_results_addr_arg = get_compile_time_arg_val(6);
-constexpr uint32_t test_results_size_bytes = get_compile_time_arg_val(7);
-constexpr uint32_t gk_interface_addr_l = get_compile_time_arg_val(8);
-constexpr uint32_t gk_interface_addr_h = get_compile_time_arg_val(9);
-constexpr uint32_t client_interface_addr = get_compile_time_arg_val(10);
-constexpr uint32_t client_pull_req_buf_addr = get_compile_time_arg_val(11);
-constexpr uint32_t data_buffer_start_addr = get_compile_time_arg_val(12);
-constexpr uint32_t data_buffer_size_words = get_compile_time_arg_val(13);
-
-volatile tt_l1_ptr chan_req_buf* client_pull_req_buf =
-    reinterpret_cast<volatile tt_l1_ptr chan_req_buf*>(client_pull_req_buf_addr);
-volatile tt_fabric_client_interface_t* client_interface = (volatile tt_fabric_client_interface_t*)client_interface_addr;
-uint64_t xy_local_addr;
-socket_reader_state socket_reader;
-
-tt_l1_ptr uint32_t* const test_results = reinterpret_cast<tt_l1_ptr uint32_t*>(test_results_addr_arg);
-
-#define PAYLOAD_MASK (0xFFFF0000)
-
-void kernel_main() {
-    uint64_t processed_packet_words = 0, num_packets = 0;
-    volatile tt_l1_ptr uint32_t* poll_addr;
-    uint32_t poll_val = 0;
-    bool async_wr_check_failed = false;
-
-    // parse runtime args
-    uint32_t dest_device = get_arg_val<uint32_t>(0);
-
-    tt_fabric_init();
-
-    zero_l1_buf(test_results, test_results_size_bytes);
-    test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_STARTED;
-    test_results[PQ_TEST_MISC_INDEX] = 0xff000000;
-    zero_l1_buf(
-        reinterpret_cast<tt_l1_ptr uint32_t*>(data_buffer_start_addr), data_buffer_size_words * PACKET_WORD_SIZE_BYTES);
-    test_results[PQ_TEST_MISC_INDEX] = 0xff000001;
-    zero_l1_buf((uint32_t*)client_interface, sizeof(tt_fabric_client_interface_t));
-    test_results[PQ_TEST_MISC_INDEX] = 0xff000002;
-    zero_l1_buf((uint32_t*)client_pull_req_buf, sizeof(chan_req_buf));
-    test_results[PQ_TEST_MISC_INDEX] = 0xff000003;
-
-    client_interface->gk_interface_addr = ((uint64_t)gk_interface_addr_h << 32) | gk_interface_addr_l;
-    client_interface->gk_msg_buf_addr = client_interface->gk_interface_addr + offsetof(gatekeeper_info_t, gk_msg_buf);
-    client_interface->pull_req_buf_addr = xy_local_addr | client_pull_req_buf_addr;
-    test_results[PQ_TEST_MISC_INDEX] = 0xff000004;
-
-    // make sure fabric node gatekeeper is available.
-    fabric_endpoint_init();
-
-    socket_reader.init(data_buffer_start_addr, data_buffer_size_words);
-    DPRINT << "Socket open on " << dest_device << ENDL();
-    test_results[PQ_TEST_MISC_INDEX] = 0xff000005;
-
-    fabric_socket_open(
-        3,                      // the network plane to use for this socket
-        2,                      // Temporal epoch for which the socket is being opened
-        1,                      // Socket Id to open
-        SOCKET_TYPE_DGRAM,      // Unicast, Multicast, SSocket, DSocket
-        SOCKET_DIRECTION_RECV,  // Send or Receive
-        dest_device >> 16,      // Remote mesh/device that is the socket data sender/receiver.
-        dest_device & 0xFFFF,
-        0                       // fabric virtual channel.
-    );
-    test_results[PQ_TEST_MISC_INDEX] = 0xff000006;
-
-    uint32_t loop_count = 0;
-    uint32_t packet_count = 0;
-    while (1) {
-        if (!fvc_req_buf_is_empty(client_pull_req_buf) && fvc_req_valid(client_pull_req_buf)) {
-            uint32_t req_index = client_pull_req_buf->rdptr.ptr & CHAN_REQ_BUF_SIZE_MASK;
-            chan_request_entry_t* req = (chan_request_entry_t*)client_pull_req_buf->chan_req + req_index;
-            pull_request_t* pull_req = &req->pull_request;
-            if (socket_reader.packet_in_progress == 0) {
-                DPRINT << "Socket Packet " << packet_count << ENDL();
-            }
-            if (pull_req->flags == FORWARD) {
-                socket_reader.pull_socket_data(pull_req);
-                test_results[PQ_TEST_MISC_INDEX] = 0xDD000001;
-                noc_async_read_barrier();
-                update_pull_request_words_cleared(pull_req);
-                socket_reader.pull_words_in_flight = 0;
-                socket_reader.push_socket_data();
-            }
-
-            if (socket_reader.packet_in_progress == 1 and socket_reader.packet_words_remaining == 0) {
-                // wait for any pending sockat data writes to finish.
-                test_results[PQ_TEST_MISC_INDEX] = 0xDD000002;
-
-                noc_async_write_barrier();
-
-                test_results[PQ_TEST_MISC_INDEX] = 0xDD000003;
-                // clear the flags field to invalidate pull request slot.
-                // flags will be set to non-zero by next requestor.
-                req_buf_advance_rdptr((chan_req_buf*)client_pull_req_buf);
-                socket_reader.packet_in_progress = 0;
-                packet_count++;
-                loop_count = 0;
-            }
-        }
-        test_results[PQ_TEST_MISC_INDEX] = 0xDD400000 | (loop_count & 0xfffff);
-
-        loop_count++;
-        if (packet_count > 0 and loop_count >= 0x10000) {
-            DPRINT << "Socket Rx Finished" << packet_count << ENDL();
-            break;
-        }
-    }
-
-    // write out results
-    set_64b_result(test_results, processed_packet_words, PQ_TEST_WORD_CNT_INDEX);
-    set_64b_result(test_results, num_packets, TX_TEST_IDX_NPKT);
-
-    if (async_wr_check_failed) {
-        test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_DATA_MISMATCH;
-    } else {
-        test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_PASS;
-        test_results[PQ_TEST_MISC_INDEX] = 0xff000005;
-    }
-}
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp
index b150e944b71..be7520fd4b7 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp
@@ -8,7 +8,6 @@
 #include "tt_fabric/hw/inc/tt_fabric.h"
 #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp"
 #include "tt_fabric/hw/inc/tt_fabric_interface.h"
-#include "tt_fabric/hw/inc/tt_fabric_api.h"
 // clang-format on
 
 uint32_t src_endpoint_id;
@@ -21,40 +20,41 @@ constexpr uint32_t data_buffer_start_addr = get_compile_time_arg_val(3);
 constexpr uint32_t data_buffer_size_words = get_compile_time_arg_val(4);
 
 constexpr uint32_t routing_table_start_addr = get_compile_time_arg_val(5);
+// constexpr uint32_t router_x = get_compile_time_arg_val(6);
+// constexpr uint32_t router_y = get_compile_time_arg_val(7);
 
-constexpr uint32_t test_results_addr_arg = get_compile_time_arg_val(6);
-constexpr uint32_t test_results_size_bytes = get_compile_time_arg_val(7);
+constexpr uint32_t test_results_addr_arg = get_compile_time_arg_val(8);
+constexpr uint32_t test_results_size_bytes = get_compile_time_arg_val(9);
 
 tt_l1_ptr uint32_t* const test_results = reinterpret_cast<tt_l1_ptr uint32_t*>(test_results_addr_arg);
 
-constexpr uint32_t prng_seed = get_compile_time_arg_val(8);
+constexpr uint32_t prng_seed = get_compile_time_arg_val(10);
 
-constexpr uint32_t total_data_kb = get_compile_time_arg_val(9);
+constexpr uint32_t total_data_kb = get_compile_time_arg_val(11);
 constexpr uint64_t total_data_words = ((uint64_t)total_data_kb) * 1024 / PACKET_WORD_SIZE_BYTES;
 
-constexpr uint32_t max_packet_size_words = get_compile_time_arg_val(10);
+constexpr uint32_t max_packet_size_words = get_compile_time_arg_val(12);
 static_assert(max_packet_size_words > 3, "max_packet_size_words must be greater than 3");
 
-constexpr uint32_t timeout_cycles = get_compile_time_arg_val(11);
+constexpr uint32_t timeout_cycles = get_compile_time_arg_val(13);
 
-constexpr bool skip_pkt_content_gen = get_compile_time_arg_val(12);
+constexpr bool skip_pkt_content_gen = get_compile_time_arg_val(14);
 constexpr pkt_dest_size_choices_t pkt_dest_size_choice =
-    static_cast<pkt_dest_size_choices_t>(get_compile_time_arg_val(13));
+    static_cast<pkt_dest_size_choices_t>(get_compile_time_arg_val(15));
 
-constexpr uint32_t data_sent_per_iter_low = get_compile_time_arg_val(14);
-constexpr uint32_t data_sent_per_iter_high = get_compile_time_arg_val(15);
-constexpr uint32_t test_command = get_compile_time_arg_val(16);
+constexpr uint32_t data_sent_per_iter_low = get_compile_time_arg_val(16);
+constexpr uint32_t data_sent_per_iter_high = get_compile_time_arg_val(17);
+constexpr uint32_t test_command = get_compile_time_arg_val(18);
 
-uint32_t base_target_address = get_compile_time_arg_val(17);
+uint32_t base_target_address = get_compile_time_arg_val(19);
 
 // atomic increment for the ATOMIC_INC command
-constexpr uint32_t atomic_increment = get_compile_time_arg_val(18);
+constexpr uint32_t atomic_increment = get_compile_time_arg_val(20);
 // constexpr uint32_t dest_device = get_compile_time_arg_val(21);
 uint32_t dest_device;
 
-constexpr uint32_t signal_address = get_compile_time_arg_val(19);
-constexpr uint32_t client_interface_addr = get_compile_time_arg_val(20);
+constexpr uint32_t signal_address = get_compile_time_arg_val(21);
 
 uint32_t max_packet_size_mask;
 
@@ -62,10 +62,8 @@ auto input_queue_state = select_input_queue();
 volatile local_pull_request_t *local_pull_request = (volatile local_pull_request_t *)(data_buffer_start_addr - 1024);
 tt_l1_ptr volatile tt::tt_fabric::fabric_router_l1_config_t* routing_table =
     reinterpret_cast<tt_l1_ptr volatile tt::tt_fabric::fabric_router_l1_config_t*>(routing_table_start_addr);
-volatile tt_fabric_client_interface_t* client_interface = (volatile tt_fabric_client_interface_t*)client_interface_addr;
 
 fvc_producer_state_t test_producer __attribute__((aligned(16)));
-fvcc_inbound_state_t fvcc_test_producer __attribute__((aligned(16)));
 
 uint64_t xy_local_addr;
 
@@ -75,9 +73,6 @@ uint32_t target_address;
 uint32_t noc_offset;
 uint32_t rx_addr_hi;
 
-uint32_t gk_interface_addr_l;
-uint32_t gk_interface_addr_h;
-
 // generates packets with random size and payload on the input side
 inline bool test_buffer_handler_async_wr() {
     if (input_queue_state.all_packets_done()) {
@@ -248,101 +243,27 @@ inline bool test_buffer_handler_atomic_inc() {
     return false;
 }
 
-inline bool test_buffer_handler_fvcc() {
-    if (input_queue_state.all_packets_done()) {
-        return true;
-    }
-
-    uint32_t free_words = fvcc_test_producer.get_num_msgs_free() * PACKET_HEADER_SIZE_WORDS;
-    if (free_words < PACKET_HEADER_SIZE_WORDS) {
-        return false;
-    }
-
-    uint32_t byte_wr_addr = fvcc_test_producer.get_local_buffer_write_addr();
-    uint32_t words_to_init = std::min(free_words, fvcc_test_producer.words_before_local_buffer_wrap());
-    uint32_t words_initialized = 0;
-    while (words_initialized < words_to_init) {
-        if (input_queue_state.all_packets_done()) {
-            break;
-        }
-
-        if (!input_queue_state.packet_active()) {  // start of a new packet
-            input_queue_state.next_inline_packet(total_data_words);
-
-            tt_l1_ptr uint32_t* header_ptr = reinterpret_cast<tt_l1_ptr uint32_t*>(byte_wr_addr);
-
-            packet_header.routing.flags = SYNC;
-            packet_header.routing.dst_mesh_id = dest_device >> 16;
-            packet_header.routing.dst_dev_id = dest_device & 0xFFFF;
-            packet_header.routing.src_dev_id = routing_table->my_device_id;
-            packet_header.routing.src_mesh_id = routing_table->my_mesh_id;
-            packet_header.routing.packet_size_bytes = PACKET_HEADER_SIZE_BYTES;
-            packet_header.session.command = ASYNC_WR_RESP;
-            packet_header.session.target_offset_l = target_address;
-            packet_header.session.target_offset_h = noc_offset;
-            packet_header.packet_parameters.misc_parameters.words[1] = 0;
-            packet_header.packet_parameters.misc_parameters.words[2] = 0;
-            tt_fabric_add_header_checksum(&packet_header);
-            uint32_t words_left = words_to_init - words_initialized;
-            bool split_header = words_left < PACKET_HEADER_SIZE_WORDS;
-            uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS;
-            if (split_header) {
-                header_words_to_init = words_left;
-            }
-            for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) {
-                header_ptr[i] = ((uint32_t*)&packet_header)[i];
-            }
-
-            words_initialized += header_words_to_init;
-            input_queue_state.curr_packet_words_remaining -= header_words_to_init;
-            byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES;
-        } else {
-            tt_l1_ptr uint32_t* header_ptr = reinterpret_cast<tt_l1_ptr uint32_t*>(byte_wr_addr);
-            uint32_t header_words_initialized =
-                input_queue_state.curr_packet_size_words - input_queue_state.curr_packet_words_remaining;
-            uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS - header_words_initialized;
-            uint32_t header_dword_index = header_words_initialized * PACKET_WORD_SIZE_BYTES / 4;
-            uint32_t words_left = words_to_init - words_initialized;
-            header_words_to_init = std::min(words_left, header_words_to_init);
-
-            for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) {
-                header_ptr[i] = ((uint32_t*)&packet_header)[i + header_dword_index];
-            }
-            words_initialized += header_words_to_init;
-            input_queue_state.curr_packet_words_remaining -= header_words_to_init;
-            byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES;
-        }
-    }
-    fvcc_test_producer.advance_local_wrptr(words_initialized / PACKET_HEADER_SIZE_WORDS);
-    return false;
-}
-
 bool test_buffer_handler() {
     if constexpr (test_command == ASYNC_WR) {
         return test_buffer_handler_async_wr();
     } else if constexpr (test_command == ATOMIC_INC) {
         return test_buffer_handler_atomic_inc();
-    } else if constexpr (test_command == ASYNC_WR_RESP) {
-        return test_buffer_handler_fvcc();
     }
 }
 
 void kernel_main() {
     tt_fabric_init();
 
-    uint32_t rt_args_idx = 0;
     // TODO: refactor
-    src_endpoint_id = get_arg_val<uint32_t>(rt_args_idx++);
-    noc_offset = get_arg_val<uint32_t>(rt_args_idx++);
-    uint32_t router_x = get_arg_val<uint32_t>(rt_args_idx++);
-    uint32_t router_y = get_arg_val<uint32_t>(rt_args_idx++);
-    dest_device = get_arg_val<uint32_t>(rt_args_idx++);
-    uint32_t rx_buf_size = get_arg_val<uint32_t>(rt_args_idx++);
-    gk_interface_addr_l = get_arg_val<uint32_t>(rt_args_idx++);
-    gk_interface_addr_h = get_arg_val<uint32_t>(rt_args_idx++);
+    src_endpoint_id = get_arg_val<uint32_t>(0);
+    noc_offset = get_arg_val<uint32_t>(1);
+    uint32_t router_x = get_arg_val<uint32_t>(2);
+    uint32_t router_y = get_arg_val<uint32_t>(3);
+    dest_device = get_arg_val<uint32_t>(4);
+    uint32_t rx_buf_size = get_arg_val<uint32_t>(5);
 
     if (ASYNC_WR == test_command) {
-        base_target_address = get_arg_val<uint32_t>(rt_args_idx++);
+        base_target_address = get_arg_val<uint32_t>(6);
     }
     target_address = base_target_address;
     rx_addr_hi = base_target_address + rx_buf_size;
@@ -363,9 +284,6 @@ void kernel_main() {
     zero_l1_buf(reinterpret_cast<tt_l1_ptr uint32_t*>(data_buffer_start_addr), data_buffer_size_words * PACKET_WORD_SIZE_BYTES);
     zero_l1_buf((uint32_t*)local_pull_request, sizeof(local_pull_request_t));
     zero_l1_buf((uint32_t*)&packet_header, sizeof(packet_header_t));
-    zero_l1_buf((uint32_t*)client_interface, sizeof(tt_fabric_client_interface_t));
-    client_interface->gk_msg_buf_addr =
-        (((uint64_t)gk_interface_addr_h << 32) | gk_interface_addr_l) + offsetof(gatekeeper_info_t, gk_msg_buf);
 
     if constexpr (pkt_dest_size_choice == pkt_dest_size_choices_t::RANDOM) {
         input_queue_state.init(src_endpoint_id, prng_seed);
@@ -376,7 +294,6 @@ void kernel_main() {
     }
 
     test_producer.init(data_buffer_start_addr, data_buffer_size_words, 0x0);
-    fvcc_test_producer.init(data_buffer_start_addr, 0x0, 0x0);
 
     uint32_t temp = max_packet_size_words;
     max_packet_size_mask = 0;
@@ -447,15 +364,6 @@ void kernel_main() {
             DPRINT << "Packet Header Corrupted: packet " << packet_count
                    << " Addr: " << test_producer.get_local_buffer_read_addr() << ENDL();
             break;
-        } else if (fvcc_test_producer.get_curr_packet_valid()) {
-            fvcc_test_producer.fvcc_handler();
-#ifdef CHECK_TIMEOUT
-            progress_timestamp = get_timestamp_32b();
-#endif
-        } else if (fvcc_test_producer.packet_corrupted) {
-            DPRINT << "Packet Header Corrupted: packet " << packet_count
-                   << " Addr: " << fvcc_test_producer.get_local_buffer_read_addr() << ENDL();
-            break;
         } else if (all_packets_initialized) {
             DPRINT << "all packets done" << ENDL();
             break;
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp
deleted file mode 100644
index dfd9a66f274..00000000000
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp
+++ /dev/null
@@ -1,505 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-// clang-format off
-#include "dataflow_api.h"
-#include "debug/dprint.h"
-#include "tt_fabric/hw/inc/tt_fabric.h"
-#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp"
-#include "tt_fabric/hw/inc/tt_fabric_interface.h"
-#include "tt_fabric/hw/inc/tt_fabric_api.h"
-// clang-format on
-
-uint32_t src_endpoint_id;
-// constexpr uint32_t src_endpoint_id = get_compile_time_arg_val(0);
-constexpr uint32_t num_dest_endpoints = get_compile_time_arg_val(1);
-static_assert(is_power_of_2(num_dest_endpoints), "num_dest_endpoints must be a power of 2");
-constexpr uint32_t dest_endpoint_start_id = get_compile_time_arg_val(2);
-
-constexpr uint32_t data_buffer_start_addr = get_compile_time_arg_val(3);
-constexpr uint32_t data_buffer_size_words = get_compile_time_arg_val(4);
-
-constexpr uint32_t routing_table_start_addr = get_compile_time_arg_val(5);
-constexpr uint32_t gk_interface_addr_l = get_compile_time_arg_val(6);
-constexpr uint32_t gk_interface_addr_h = get_compile_time_arg_val(7);
-
-constexpr uint32_t test_results_addr_arg = get_compile_time_arg_val(8);
-constexpr uint32_t test_results_size_bytes = get_compile_time_arg_val(9);
-
-tt_l1_ptr uint32_t* const test_results = reinterpret_cast<tt_l1_ptr uint32_t*>(test_results_addr_arg);
-
-constexpr uint32_t prng_seed = get_compile_time_arg_val(10);
-
-constexpr uint32_t total_data_kb = get_compile_time_arg_val(11);
-constexpr uint64_t total_data_words = ((uint64_t)total_data_kb) * 1024 / PACKET_WORD_SIZE_BYTES;
-
-constexpr uint32_t max_packet_size_words = get_compile_time_arg_val(12);
-
-static_assert(max_packet_size_words > 3, "max_packet_size_words must be greater than 3");
-
-constexpr uint32_t timeout_cycles = get_compile_time_arg_val(13);
-
-constexpr bool skip_pkt_content_gen = get_compile_time_arg_val(14);
-constexpr pkt_dest_size_choices_t pkt_dest_size_choice =
-    static_cast<pkt_dest_size_choices_t>(get_compile_time_arg_val(15));
-
-constexpr uint32_t data_sent_per_iter_low = get_compile_time_arg_val(16);
-constexpr uint32_t data_sent_per_iter_high = get_compile_time_arg_val(17);
-constexpr uint32_t test_command = get_compile_time_arg_val(18);
-
-constexpr uint32_t base_target_address = get_compile_time_arg_val(19);
-uint32_t target_address = base_target_address;
-
-// atomic increment for the ATOMIC_INC command
-constexpr uint32_t atomic_increment = get_compile_time_arg_val(20);
-// constexpr uint32_t dest_device = get_compile_time_arg_val(21);
-uint32_t dest_device;
-
-constexpr uint32_t signal_address = get_compile_time_arg_val(21);
-constexpr uint32_t client_interface_addr = get_compile_time_arg_val(22);
-constexpr uint32_t client_pull_req_buf_addr = get_compile_time_arg_val(23);
-
-uint32_t max_packet_size_mask;
-
-auto input_queue_state = select_input_queue();
-volatile local_pull_request_t* local_pull_request = (volatile local_pull_request_t*)(data_buffer_start_addr - 1024);
-volatile tt_l1_ptr tt::tt_fabric::fabric_router_l1_config_t* routing_table =
-    reinterpret_cast<volatile tt_l1_ptr tt::tt_fabric::fabric_router_l1_config_t*>(routing_table_start_addr);
-volatile tt_fabric_client_interface_t* client_interface = (volatile tt_fabric_client_interface_t*)client_interface_addr;
-volatile tt_l1_ptr chan_req_buf* client_pull_req_buf =
-    reinterpret_cast<volatile tt_l1_ptr chan_req_buf*>(client_pull_req_buf_addr);
-
-fvc_producer_state_t test_producer __attribute__((aligned(16)));
-fvcc_inbound_state_t fvcc_test_producer __attribute__((aligned(16)));
-
-uint64_t xy_local_addr;
-
-packet_header_t packet_header __attribute__((aligned(16)));
-
-uint32_t noc_offset;
-
-// generates packets with random size and payload on the input side
-inline bool test_buffer_handler_dsocket_wr(socket_handle_t* socket_handle) {
-    if (input_queue_state.all_packets_done()) {
-        return true;
-    }
-
-    uint32_t free_words = test_producer.get_num_words_free();
-    if (free_words < PACKET_HEADER_SIZE_WORDS) {
-        return false;
-    }
-
-    // Each call to test_buffer_handler initializes only up to the end
-    // of the producer buffer. Since the header is 3 words, we need to handle
-    // split header cases, where the buffer has not enough space for the full header.
-    // In this case, we write as many words as space available, and remaining header words
-    // are written on next call.
-    uint32_t byte_wr_addr = test_producer.get_local_buffer_write_addr();
-    uint32_t words_to_init = std::min(free_words, test_producer.words_before_local_buffer_wrap());
-    uint32_t words_initialized = 0;
-    while (words_initialized < words_to_init) {
-        if (input_queue_state.all_packets_done()) {
-            break;
-        }
-
-        if (!input_queue_state.packet_active()) {  // start of a new packet
-            input_queue_state.next_packet(
-                num_dest_endpoints,
-                dest_endpoint_start_id,
-                max_packet_size_words,
-                max_packet_size_mask,
-                total_data_words);
-
-            tt_l1_ptr uint32_t* header_ptr = reinterpret_cast<tt_l1_ptr uint32_t*>(byte_wr_addr);
-
-            packet_header.routing.flags = FORWARD;
-            packet_header.routing.packet_size_bytes = input_queue_state.curr_packet_size_words * PACKET_WORD_SIZE_BYTES;
-            packet_header.routing.dst_mesh_id = dest_device >> 16;
-            packet_header.routing.dst_dev_id = dest_device & 0xFFFF;
-            packet_header.session.command = DSOCKET_WR;
-            packet_header.session.target_offset_l = (uint32_t)socket_handle->pull_notification_adddr;
-            packet_header.session.target_offset_h = socket_handle->pull_notification_adddr >> 32;
-            target_address += packet_header.routing.packet_size_bytes - PACKET_HEADER_SIZE_BYTES;
-            packet_header.packet_parameters.misc_parameters.words[1] = input_queue_state.packet_rnd_seed;
-            tt_fabric_add_header_checksum(&packet_header);
-            uint32_t words_left = words_to_init - words_initialized;
-            bool split_header = words_left < PACKET_HEADER_SIZE_WORDS;
-            uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS;
-            if (split_header) {
-                header_words_to_init = words_left;
-            }
-
-            for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) {
-                header_ptr[i] = ((uint32_t*)&packet_header)[i];
-            }
-
-            words_initialized += header_words_to_init;
-            input_queue_state.curr_packet_words_remaining -= header_words_to_init;
-            byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES;
-        } else {
-            uint32_t words_remaining = words_to_init - words_initialized;
-            uint32_t packet_words_initialized =
-                input_queue_state.curr_packet_size_words - input_queue_state.curr_packet_words_remaining;
-            if (packet_words_initialized < PACKET_HEADER_SIZE_WORDS) {
-                tt_l1_ptr uint32_t* header_ptr = reinterpret_cast<tt_l1_ptr uint32_t*>(byte_wr_addr);
-                uint32_t header_words_initialized = packet_words_initialized;
-                uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS - header_words_initialized;
-                uint32_t header_dword_index = header_words_initialized * PACKET_WORD_SIZE_BYTES / 4;
-                header_words_to_init = std::min(words_remaining, header_words_to_init);
-                for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) {
-                    header_ptr[i] = ((uint32_t*)&packet_header)[i + header_dword_index];
-                }
-                words_initialized += header_words_to_init;
-                words_remaining = words_to_init - words_initialized;
-                input_queue_state.curr_packet_words_remaining -= header_words_to_init;
-                byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES;
-                if (words_remaining == 0) {
-                    // no space left for packet data.
-                    break;
-                }
-            }
-
-            uint32_t num_words = std::min(words_remaining, input_queue_state.curr_packet_words_remaining);
-            if constexpr (!skip_pkt_content_gen) {
-                uint32_t start_val = (input_queue_state.packet_rnd_seed & 0xFFFF0000) +
-                                     (input_queue_state.curr_packet_size_words -
-                                      input_queue_state.curr_packet_words_remaining - PACKET_HEADER_SIZE_WORDS);
-                fill_packet_data(reinterpret_cast<tt_l1_ptr uint32_t*>(byte_wr_addr), num_words, start_val);
-            }
-            words_initialized += num_words;
-            input_queue_state.curr_packet_words_remaining -= num_words;
-            byte_wr_addr += num_words * PACKET_WORD_SIZE_BYTES;
-        }
-    }
-    test_producer.advance_local_wrptr(words_initialized);
-    return false;
-}
-
-// generates packets with random size and payload on the input side
-inline bool test_buffer_handler_atomic_inc() {
-    if (input_queue_state.all_packets_done()) {
-        return true;
-    }
-
-    uint32_t free_words = test_producer.get_num_words_free();
-    if (free_words < PACKET_HEADER_SIZE_WORDS) {
-        return false;
-    }
-
-    uint32_t byte_wr_addr = test_producer.get_local_buffer_write_addr();
-    uint32_t words_to_init = std::min(free_words, test_producer.words_before_local_buffer_wrap());
-    uint32_t words_initialized = 0;
-    while (words_initialized < words_to_init) {
-        if (input_queue_state.all_packets_done()) {
-            break;
-        }
-
-        if (!input_queue_state.packet_active()) {  // start of a new packet
-            input_queue_state.next_inline_packet(total_data_words);
-
-            tt_l1_ptr uint32_t* header_ptr = reinterpret_cast<tt_l1_ptr uint32_t*>(byte_wr_addr);
-
-            packet_header.routing.flags = INLINE_FORWARD;
-            packet_header.routing.dst_mesh_id = dest_device >> 16;
-            packet_header.routing.dst_dev_id = dest_device & 0xFFFF;
-            packet_header.routing.packet_size_bytes = PACKET_HEADER_SIZE_BYTES;
-            packet_header.session.command = ATOMIC_INC;
-            packet_header.session.target_offset_l = target_address;
-            packet_header.session.target_offset_h = noc_offset;
-            packet_header.packet_parameters.atomic_parameters.wrap_boundary = 31;
-            packet_header.packet_parameters.atomic_parameters.increment = atomic_increment;
-            tt_fabric_add_header_checksum(&packet_header);
-            uint32_t words_left = words_to_init - words_initialized;
-            bool split_header = words_left < PACKET_HEADER_SIZE_WORDS;
-            uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS;
-            if (split_header) {
-                header_words_to_init = words_left;
-            }
-            for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) {
-                header_ptr[i] = ((uint32_t*)&packet_header)[i];
-            }
-
-            words_initialized += header_words_to_init;
-            input_queue_state.curr_packet_words_remaining -= header_words_to_init;
-            byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES;
-        } else {
-            tt_l1_ptr uint32_t* header_ptr = reinterpret_cast<tt_l1_ptr uint32_t*>(byte_wr_addr);
-            uint32_t header_words_initialized =
-                input_queue_state.curr_packet_size_words - input_queue_state.curr_packet_words_remaining;
-            uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS - header_words_initialized;
-            uint32_t header_dword_index = header_words_initialized * PACKET_WORD_SIZE_BYTES / 4;
-            uint32_t words_left = words_to_init - words_initialized;
-            header_words_to_init = std::min(words_left, header_words_to_init);
-
-            for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) {
-                header_ptr[i] = ((uint32_t*)&packet_header)[i + header_dword_index];
-            }
-            words_initialized += header_words_to_init;
-            input_queue_state.curr_packet_words_remaining -= header_words_to_init;
-            byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES;
-        }
-    }
-    test_producer.advance_local_wrptr(words_initialized);
-    return false;
-}
-
-inline bool test_buffer_handler_fvcc() {
-    if (input_queue_state.all_packets_done()) {
-        return true;
-    }
-
-    uint32_t free_words = fvcc_test_producer.get_num_msgs_free() * PACKET_HEADER_SIZE_WORDS;
-    if (free_words < PACKET_HEADER_SIZE_WORDS) {
-        return false;
-    }
-
-    uint32_t byte_wr_addr = fvcc_test_producer.get_local_buffer_write_addr();
-    uint32_t words_to_init = std::min(free_words, fvcc_test_producer.words_before_local_buffer_wrap());
-    uint32_t words_initialized = 0;
-    while (words_initialized < words_to_init) {
-        if (input_queue_state.all_packets_done()) {
-            break;
-        }
-
-        if (!input_queue_state.packet_active()) {  // start of a new packet
-            input_queue_state.next_inline_packet(total_data_words);
-
-            tt_l1_ptr uint32_t* header_ptr = reinterpret_cast<tt_l1_ptr uint32_t*>(byte_wr_addr);
-
-            packet_header.routing.flags = SYNC;
-            packet_header.routing.dst_mesh_id = dest_device >> 16;
-            packet_header.routing.dst_dev_id = dest_device & 0xFFFF;
-            packet_header.routing.src_dev_id = routing_table->my_device_id;
-            packet_header.routing.src_mesh_id = routing_table->my_mesh_id;
-            packet_header.routing.packet_size_bytes = PACKET_HEADER_SIZE_BYTES;
-            packet_header.session.command = ASYNC_WR_RESP;
-            packet_header.session.target_offset_l = target_address;
-            packet_header.session.target_offset_h = noc_offset;
-            packet_header.packet_parameters.misc_parameters.words[1] = 0;
-            packet_header.packet_parameters.misc_parameters.words[2] = 0;
-
-            tt_fabric_add_header_checksum(&packet_header);
-            uint32_t words_left = words_to_init - words_initialized;
-            bool split_header = words_left < PACKET_HEADER_SIZE_WORDS;
-            uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS;
-            if (split_header) {
-                header_words_to_init = words_left;
-            }
-            for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) {
-                header_ptr[i] = ((uint32_t*)&packet_header)[i];
-            }
-
-            words_initialized += header_words_to_init;
-            input_queue_state.curr_packet_words_remaining -= header_words_to_init;
-            byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES;
-        } else {
-            tt_l1_ptr uint32_t* header_ptr = reinterpret_cast<tt_l1_ptr uint32_t*>(byte_wr_addr);
-            uint32_t header_words_initialized =
-                input_queue_state.curr_packet_size_words - input_queue_state.curr_packet_words_remaining;
-            uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS - header_words_initialized;
-            uint32_t header_dword_index = header_words_initialized * PACKET_WORD_SIZE_BYTES / 4;
-            uint32_t words_left = words_to_init - words_initialized;
-            header_words_to_init = std::min(words_left, header_words_to_init);
-
-            for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) {
-                header_ptr[i] = ((uint32_t*)&packet_header)[i + header_dword_index];
-            }
-            words_initialized += header_words_to_init;
-            input_queue_state.curr_packet_words_remaining -= header_words_to_init;
-            byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES;
-        }
-    }
-    fvcc_test_producer.advance_local_wrptr(words_initialized / PACKET_HEADER_SIZE_WORDS);
-    return false;
-}
-
-bool test_buffer_handler(socket_handle_t* socket_handle) {
-    if constexpr (test_command == DSOCKET_WR) {
-        return test_buffer_handler_dsocket_wr(socket_handle);
-    } else if constexpr (test_command == ATOMIC_INC) {
-        return test_buffer_handler_atomic_inc();
-    } else if constexpr (test_command == SOCKET_OPEN) {
-        return test_buffer_handler_fvcc();
-    }
-    return true;
-}
-
-void kernel_main() {
-    tt_fabric_init();
-
-    // TODO: refactor
-    src_endpoint_id = get_arg_val<uint32_t>(0);
-    noc_offset = get_arg_val<uint32_t>(1);
-    uint32_t router_x = get_arg_val<uint32_t>(2);
-    uint32_t router_y = get_arg_val<uint32_t>(3);
-    dest_device = get_arg_val<uint32_t>(4);
-
-    if (ASYNC_WR == test_command) {
-        target_address = get_arg_val<uint32_t>(5);
-    }
-
-    uint64_t router_config_addr = NOC_XY_ADDR(router_x, router_y, eth_l1_mem::address_map::FABRIC_ROUTER_CONFIG_BASE);
-    noc_async_read_one_packet(
-        router_config_addr, routing_table_start_addr, sizeof(tt::tt_fabric::fabric_router_l1_config_t));
-    noc_async_read_barrier();
-
-    zero_l1_buf(test_results, test_results_size_bytes);
-    test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_STARTED;
-    test_results[PQ_TEST_STATUS_INDEX + 1] = (uint32_t)local_pull_request;
-
-    test_results[PQ_TEST_MISC_INDEX] = 0xff000000;
-    test_results[PQ_TEST_MISC_INDEX + 1] = 0xcc000000 | src_endpoint_id;
-
-    zero_l1_buf(
-        reinterpret_cast<tt_l1_ptr uint32_t*>(data_buffer_start_addr), data_buffer_size_words * PACKET_WORD_SIZE_BYTES);
-    zero_l1_buf((uint32_t*)local_pull_request, sizeof(local_pull_request_t));
-    zero_l1_buf((uint32_t*)&packet_header, sizeof(packet_header_t));
-    zero_l1_buf((uint32_t*)client_interface, sizeof(tt_fabric_client_interface_t));
-    zero_l1_buf((uint32_t*)client_pull_req_buf, sizeof(chan_req_buf));
-    client_interface->gk_interface_addr = ((uint64_t)gk_interface_addr_h << 32) | gk_interface_addr_l;
-    client_interface->gk_msg_buf_addr = client_interface->gk_interface_addr + offsetof(gatekeeper_info_t, gk_msg_buf);
-    client_interface->pull_req_buf_addr = xy_local_addr | client_pull_req_buf_addr;
-
-    // make sure fabric node gatekeeper is available.
-    fabric_endpoint_init();
-
-    if constexpr (pkt_dest_size_choice == pkt_dest_size_choices_t::RANDOM) {
-        input_queue_state.init(src_endpoint_id, prng_seed);
-    } else if constexpr (pkt_dest_size_choice == pkt_dest_size_choices_t::SAME_START_RNDROBIN_FIX_SIZE) {
-        input_queue_state.init(max_packet_size_words, 0);
-    } else {
-        input_queue_state.init(src_endpoint_id, prng_seed);
-    }
-
-    test_producer.init(data_buffer_start_addr, data_buffer_size_words, 0x0);
-    fvcc_test_producer.init(data_buffer_start_addr, 0x0, 0x0);
-
-    uint32_t temp = max_packet_size_words;
-    max_packet_size_mask = 0;
-    temp >>= 1;
-    while (temp) {
-        max_packet_size_mask = (max_packet_size_mask << 1) + 1;
-        temp >>= 1;
-    }
-    if ((max_packet_size_mask + 1) != max_packet_size_words) {
-        // max_packet_size_words is not a power of 2
-        // snap to next power of 2 mask
-        max_packet_size_mask = (max_packet_size_mask << 1) + 1;
-    }
-
-    // wait till test sends start signal. This is set by test
-    // once tt_fabric kernels have been launched on all the test devices.
-    while (*(volatile tt_l1_ptr uint32_t*)signal_address == 0);
-
-    test_results[PQ_TEST_MISC_INDEX] = 0xff000001;
-
-    uint64_t data_words_sent = 0;
-    uint64_t iter = 0;
-    uint64_t zero_data_sent_iter = 0;
-    uint64_t few_data_sent_iter = 0;
-    uint64_t many_data_sent_iter = 0;
-    uint64_t words_flushed = 0;
-    bool timeout = false;
-    uint64_t start_timestamp = get_timestamp();
-    uint32_t progress_timestamp = start_timestamp & 0xFFFFFFFF;
-
-    uint32_t curr_packet_size = 0;
-    uint32_t curr_packet_words_sent = 0;
-    uint32_t packet_count = 0;
-
-    socket_handle_t* socket_handle = fabric_socket_open(
-        3,                      // the network plane to use for this socket
-        2,                      // Temporal epoch for which the socket is being opened
-        1,                      // Socket Id to open
-        SOCKET_TYPE_DGRAM,      // Unicast, Multicast, SSocket, DSocket
-        SOCKET_DIRECTION_SEND,  // Send or Receive
-        dest_device >> 16,      // Remote mesh/device that is the socket data sender/receiver.
-        dest_device & 0xFFFF,
-        0                       // fabric virtual channel.
-    );
-
-    fabric_socket_connect(socket_handle);
-    DPRINT << "Socket 1 connected. Handle = " << (uint32_t)socket_handle << ENDL();
-
-    while (true) {
-        iter++;
-#ifdef CHECK_TIMEOUT
-        if (timeout_cycles > 0) {
-            uint32_t cycles_since_progress = get_timestamp_32b() - progress_timestamp;
-            if (cycles_since_progress > timeout_cycles) {
-                timeout = true;
-                break;
-            }
-        }
-#endif
-
-        bool all_packets_initialized = test_buffer_handler(socket_handle);
-
-        if (test_producer.get_curr_packet_valid()) {
-            curr_packet_size =
-                (test_producer.current_packet_header.routing.packet_size_bytes + PACKET_WORD_SIZE_BYTES - 1) >> 4;
-            uint32_t curr_data_words_sent = test_producer.pull_data_from_fvc_buffer();
-            curr_packet_words_sent += curr_data_words_sent;
-            data_words_sent += curr_data_words_sent;
-            if constexpr (!(data_sent_per_iter_low == 0 && data_sent_per_iter_high == 0)) {
-                zero_data_sent_iter += static_cast<uint64_t>(curr_data_words_sent <= 0);
-                few_data_sent_iter += static_cast<uint64_t>(curr_data_words_sent <= data_sent_per_iter_low);
-                many_data_sent_iter += static_cast<uint64_t>(curr_data_words_sent >= data_sent_per_iter_high);
-            }
-#ifdef CHECK_TIMEOUT
-            progress_timestamp = (curr_data_words_sent > 0) ? get_timestamp_32b() : progress_timestamp;
-#endif
-            if (curr_packet_words_sent == curr_packet_size) {
-                curr_packet_words_sent = 0;
-                packet_count++;
-            }
-        } else if (test_producer.packet_corrupted) {
-            DPRINT << "Packet Header Corrupted: packet " << packet_count
-                   << " Addr: " << test_producer.get_local_buffer_read_addr() << ENDL();
-            break;
-        } else if (fvcc_test_producer.get_curr_packet_valid()) {
-            fvcc_test_producer.fvcc_handler();
-#ifdef CHECK_TIMEOUT
-            progress_timestamp = get_timestamp_32b();
-#endif
-        } else if (fvcc_test_producer.packet_corrupted) {
-            DPRINT << "Packet Header Corrupted: packet " << packet_count
-                   << " Addr: " << fvcc_test_producer.get_local_buffer_read_addr() << ENDL();
-            break;
-        } else if (all_packets_initialized) {
-            DPRINT << "all packets done" << ENDL();
-            break;
-        }
-    }
-
-    /*
-    fabric_socket_open(3, 4, 2, SOCKET_TYPE_DGRAM, SOCKET_DIRECTION_SEND, dest_device >> 16, dest_device & 0xFFFF,
-    0); fabric_socket_open(3, 6, 3, SOCKET_TYPE_DGRAM, SOCKET_DIRECTION_SEND, dest_device >> 16, dest_device &
-    0xFFFF, 0); fabric_socket_open(3, 8, 4, SOCKET_TYPE_DGRAM, SOCKET_DIRECTION_SEND, dest_device >> 16, dest_device
-    & 0xFFFF, 0); fabric_socket_open(3, 10, 5, SOCKET_TYPE_DGRAM, SOCKET_DIRECTION_SEND, dest_device >> 16,
-    dest_device & 0xFFFF, 0);
-    */
-    uint64_t cycles_elapsed = get_timestamp() - start_timestamp;
-
-    uint64_t num_packets = input_queue_state.get_num_packets();
-    set_64b_result(test_results, data_words_sent, PQ_TEST_WORD_CNT_INDEX);
-    set_64b_result(test_results, cycles_elapsed, PQ_TEST_CYCLES_INDEX);
-    set_64b_result(test_results, iter, PQ_TEST_ITER_INDEX);
-    set_64b_result(test_results, total_data_words, TX_TEST_IDX_TOT_DATA_WORDS);
-    set_64b_result(test_results, num_packets, TX_TEST_IDX_NPKT);
-    set_64b_result(test_results, zero_data_sent_iter, TX_TEST_IDX_ZERO_DATA_WORDS_SENT_ITER);
-    set_64b_result(test_results, few_data_sent_iter, TX_TEST_IDX_FEW_DATA_WORDS_SENT_ITER);
-    set_64b_result(test_results, many_data_sent_iter, TX_TEST_IDX_MANY_DATA_WORDS_SENT_ITER);
-
-    if (test_producer.packet_corrupted) {
-        test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_BAD_HEADER;
-        test_results[PQ_TEST_MISC_INDEX] = packet_count;
-    } else if (!timeout) {
-        test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_PASS;
-        test_results[PQ_TEST_MISC_INDEX] = packet_count;
-    } else {
-        test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_TIMEOUT;
-        set_64b_result(test_results, words_flushed, TX_TEST_IDX_WORDS_FLUSHED);
-    }
-}
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp
index 97bb28bd93f..352a3d107eb 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp
@@ -23,8 +23,6 @@ int main(int argc, char** argv) {
     constexpr uint32_t default_tx_y = 0;
     constexpr uint32_t default_rx_x = 0;
     constexpr uint32_t default_rx_y = 3;
-    constexpr uint32_t default_gk_x = 0;
-    constexpr uint32_t default_gk_y = 9;
 
     constexpr uint32_t default_mux_x = 0;
     constexpr uint32_t default_mux_y = 1;
@@ -157,8 +155,6 @@ int main(int argc, char** argv) {
     uint32_t tx_y = test_args::get_command_option_uint32(input_args, "--tx_y", default_tx_y);
     uint32_t rx_x = test_args::get_command_option_uint32(input_args, "--rx_x", default_rx_x);
     uint32_t rx_y = test_args::get_command_option_uint32(input_args, "--rx_y", default_rx_y);
-    uint32_t gk_x = test_args::get_command_option_uint32(input_args, "--gk_x", default_gk_x);
-    uint32_t gk_y = test_args::get_command_option_uint32(input_args, "--gk_y", default_gk_y);
     uint32_t prng_seed = test_args::get_command_option_uint32(input_args, "--prng_seed", default_prng_seed);
     uint32_t data_kb_per_tx = test_args::get_command_option_uint32(input_args, "--data_kb_per_tx", default_data_kb_per_tx);
@@ -223,8 +219,6 @@ int main(int argc, char** argv) {
 
     bool pass = true;
 
-    CoreCoord gk_core = {gk_x, gk_y};
-
     std::map<std::string, std::string> defines = {
         {"FD_CORE_TYPE", std::to_string(0)},  // todo, support dispatch on eth
     };
@@ -295,15 +289,6 @@ int main(int argc, char** argv) {
     bool router_core_found = false;
     CoreCoord router_logical_core;
     CoreCoord router_phys_core;
-    CoreCoord gk_phys_core;
-    uint32_t routing_table_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED);
-    uint32_t gk_interface_addr = routing_table_addr + sizeof(tt::tt_fabric::fabric_router_l1_config_t) * 4;
-    uint32_t client_interface_addr = routing_table_addr + sizeof(tt::tt_fabric::fabric_router_l1_config_t) * 4;
-    uint32_t socket_info_addr = gk_interface_addr + sizeof(gatekeeper_info_t);
-    log_info(LogTest, "GK Routing Table Addr = 0x{:08X}", routing_table_addr);
-    log_info(LogTest, "GK Info Addr = 0x{:08X}", gk_interface_addr);
-    log_info(LogTest, "GK Socket Info Addr = 0x{:08X}", socket_info_addr);
-
     for (auto device : device_map) {
         auto neighbors = tt::Cluster::instance().get_ethernet_connected_device_ids(device.second->id());
         std::vector<CoreCoord> device_router_cores;
@@ -337,25 +322,17 @@ int main(int argc, char** argv) {
         log_info(LogTest, "Device {} router_mask = 0x{:04X}", device.first, router_mask);
         uint32_t sem_count = device_router_cores.size();
         device_router_map[device.first] = device_router_phys_cores;
-
-        gk_phys_core = (device.second->worker_core_from_logical_core(gk_core));
-        uint32_t gk_noc_offset = tt_metal::hal.noc_xy_encoding(gk_phys_core.x, gk_phys_core.y);
-
-        std::vector<uint32_t> router_compile_args = {
-            (tunneler_queue_size_bytes >> 4), // 0: rx_queue_size_words
-            tunneler_test_results_addr, // 1: test_results_addr
-            tunneler_test_results_size, // 2: test_results_size
-            0, // 3: timeout_cycles
-        };
-
-        std::vector<uint32_t> router_runtime_args = {
-            sem_count, // 0: number of active fabric routers
-            router_mask, // 1: active fabric router mask
-            gk_interface_addr, // 2: gk_message_addr_l
-            gk_noc_offset, // 3: gk_message_addr_h
+        std::vector<uint32_t> runtime_args = {
+            sem_count, // 0: number of active fabric routers
+            router_mask, // 1: active fabric router mask
         };
-
         for (auto logical_core : device_router_cores) {
+            std::vector<uint32_t> router_compile_args = {
+                (tunneler_queue_size_bytes >> 4), // 0: rx_queue_size_words
+                tunneler_test_results_addr, // 1: test_results_addr
+                tunneler_test_results_size, // 2: test_results_size
+                0, // 3: timeout_cycles
+            };
             auto router_kernel = tt_metal::CreateKernel(
                 program_map[device.first],
                 "tt_fabric/impl/kernels/tt_fabric_router.cpp",
@@ -363,7 +340,7 @@ int main(int argc, char** argv) {
                 tt_metal::EthernetConfig{
                     .noc = tt_metal::NOC::NOC_0, .compile_args = router_compile_args, .defines = defines});
 
-            tt_metal::SetRuntimeArgs(program_map[device.first], router_kernel, logical_core, router_runtime_args);
+            tt_metal::SetRuntimeArgs(program_map[device.first], router_kernel, logical_core, runtime_args);
 
             log_debug(
                 LogTest,
@@ -371,33 +348,6 @@ int main(int argc, char** argv) {
                 device.first,
                 device.second->ethernet_core_from_logical_core(logical_core));
         }
-        // setup runtime args
-        log_info(LogTest, "run tt_fabric gatekeeper at x={},y={}", gk_core.x, gk_core.y);
-        std::vector<uint32_t> gk_compile_args = {
-            gk_interface_addr, // 0:
-            socket_info_addr, // 1:
-            routing_table_addr, // 2
-            test_results_addr, // 3: test_results_addr
-            test_results_size, // 4: test_results_size
-            0, // 5: timeout_cycles
-        };
-
-        std::vector<uint32_t> gk_runtime_args = {
-            sem_count, // 0: number of active fabric routers
-            router_mask, // 1: active fabric router mask
-        };
-
-        auto kernel = tt_metal::CreateKernel(
-            program_map[device.first],
-            "tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp",
-            {gk_core},
-            tt_metal::DataMovementConfig{
-                .processor = tt_metal::DataMovementProcessor::RISCV_0,
-                .noc = tt_metal::NOC::RISCV_0_default,
-                .compile_args = gk_compile_args,
-                .defines = defines});
-
-        tt_metal::SetRuntimeArgs(program_map[device.first], kernel, gk_core, gk_runtime_args);
     }
 
     if (check_txrx_timeout) {
@@ -408,7 +358,6 @@ int main(int argc, char** argv) {
     for (uint32_t i = 0; i < num_src_endpoints; i++) {
         CoreCoord core = {tx_x + i, tx_y};
         tx_phys_core.push_back(device_map[test_device_id_l]->worker_core_from_logical_core(core));
-        CoreCoord tx_gk_phys_core = device_map[test_device_id_l]->worker_core_from_logical_core(gk_core);
         std::vector<uint32_t> compile_args = {
             (device_map[test_device_id_l]->id() << 8) + src_endpoint_start_id + i, // 0: src_endpoint_id
             num_dest_endpoints, // 1: num_dest_endpoints
@@ -416,36 +365,32 @@ int main(int argc, char** argv) {
             tx_queue_start_addr, // 3: queue_start_addr_words
             (tx_queue_size_bytes >> 4), // 4: queue_size_words
             routing_table_start_addr, // 5: routeing table
-            test_results_addr, // 6: test_results_addr
-            test_results_size, // 7: test_results_size
-            prng_seed, // 8: prng_seed
-            data_kb_per_tx, // 9: total_data_kb
-            max_packet_size_words, // 10: max_packet_size_words
-            timeout_mcycles * 1000 * 1000 * 4, // 11: timeout_cycles
-            tx_skip_pkt_content_gen, // 12: skip_pkt_content_gen
-            tx_pkt_dest_size_choice, // 13: pkt_dest_size_choice
-            tx_data_sent_per_iter_low, // 14: data_sent_per_iter_low
-            tx_data_sent_per_iter_high, // 15: data_sent_per_iter_high
-            fabric_command, // 16: fabric command
-            target_address, // 17:
-            atomic_increment, // 18:
-            tx_signal_address, // 19:
-            client_interface_addr,
+            0, // 6: router_x
+            0, // 7: router_y
+            test_results_addr, // 8: test_results_addr
+            test_results_size, // 9: test_results_size
+            prng_seed, // 10: prng_seed
+            data_kb_per_tx, // 11: total_data_kb
+            max_packet_size_words, // 12: max_packet_size_words
+            timeout_mcycles * 1000 * 1000 * 4, // 13: timeout_cycles
+            tx_skip_pkt_content_gen, // 14: skip_pkt_content_gen
+            tx_pkt_dest_size_choice, // 15: pkt_dest_size_choice
+            tx_data_sent_per_iter_low, // 16: data_sent_per_iter_low
+            tx_data_sent_per_iter_high, // 17: data_sent_per_iter_high
+            fabric_command, // 18: fabric command
+            target_address,
+            atomic_increment,
+            tx_signal_address
         };
 
         // setup runtime args
-        uint32_t tx_gk_noc_offset = tt_metal::hal.noc_xy_encoding(tx_gk_phys_core.x, tx_gk_phys_core.y);
         std::vector<uint32_t> runtime_args = {
             (device_map[test_device_id_l]->id() << 8) + src_endpoint_start_id + i, // 0: src_endpoint_id
             0x410, // 1: dest_noc_offset
-            router_phys_core.x, // 2: router_x
-            router_phys_core.y, // 3: router_y
-            (dev_r_mesh_id << 16 | dev_r_chip_id), // 4: mesh and chip id
-            0xd0000, // 5: space in rx's L1
-            gk_interface_addr, // 6: gk_message_addr_l
-            tx_gk_noc_offset, // 7: gk_message_addr_h
-        };
+            router_phys_core.x,
+            router_phys_core.y,
+            (dev_r_mesh_id << 16 | dev_r_chip_id)};
 
         if (ASYNC_WR == fabric_command) {
             runtime_args.push_back(target_address);
@@ -476,12 +421,10 @@ int main(int argc, char** argv) {
     // Initialize tt_fabric router sync semaphore to 0. Routers on each device
     // increment once handshake with ethernet peer has been completed.
     std::vector<uint32_t> zero_buf(1, 0);
-    std::vector<uint32_t> gk_zero_buf(12, 0);
     for (auto [device_id, router_phys_cores] : device_router_map) {
         for (auto phys_core : router_phys_cores) {
             tt::llrt::write_hex_vec_to_core(device_id, phys_core, zero_buf, FABRIC_ROUTER_SYNC_SEM);
         }
-        tt::llrt::write_hex_vec_to_core(device_id, gk_phys_core, gk_zero_buf, gk_interface_addr);
     }
 
     for (uint32_t i = 0; i < num_src_endpoints; i++) {
@@ -513,10 +456,11 @@ int main(int argc, char** argv) {
 
     log_info(LogTest, "Tx Finished");
 
-    // terminate gatekeeper.
-    // Gatekeeper will signal all routers on the device to terminate.
+    // terminate fabric routers
     for (auto [device_id, router_phys_cores] : device_router_map) {
-        tt::llrt::write_hex_vec_to_core(device_id, gk_phys_core, zero_buf, gk_interface_addr);
+        for (auto phys_core : router_phys_cores) {
+            tt::llrt::write_hex_vec_to_core(device_id, phys_core, zero_buf, FABRIC_ROUTER_SYNC_SEM);
+        }
     }
 
     // wait for all kernels to finish.
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp
index 6cac07adec8..d5a758454f5 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp
@@ -31,16 +31,6 @@ std::mt19937 global_rng;
 // decides if the tx puts the data directly on eth or if a noc hop is allowed as well
 bool allow_1st_noc_hop = false;
 
-// Gatekeeper kernel coordinates
-uint32_t gk_x, gk_y;
-
-// Check if gatekeeper runs on tensix worker or idle ethernet based on the board type
-bool run_gk_on_idle_ethernet;
-
-uint32_t routing_table_addr;
-uint32_t gk_interface_addr;
-uint32_t socket_info_addr;
-
 inline std::vector<uint32_t> get_random_numbers_from_range(uint32_t start, uint32_t end, uint32_t count) {
     std::vector<uint32_t> range(end - start + 1);
 
@@ -111,11 +101,6 @@ typedef struct test_board {
         } else {
             physical_chip_ids = available_chip_ids;
         }
-
-        // gatekeeper - run on idle ethernet for n300/T3K
-        if (("n300" == board_type_) || ("t3k" == board_type_)) {
-            run_gk_on_idle_ethernet = true;
-        }
     }
 
     void _init_galaxy_board(uint32_t num_chips) {
@@ -346,13 +331,10 @@ typedef struct test_device {
     std::vector<CoreCoord> worker_cores;
     std::vector<CoreCoord> router_logical_cores;
    std::vector<CoreCoord> router_physical_cores;
-    CoreCoord gk_logical_core;
-    CoreCoord gk_phys_core;
     mesh_id_t mesh_id;
     chip_id_t logical_chip_id;
     uint32_t mesh_chip_id = 0;
     uint32_t router_mask = 0;
-    uint32_t gk_noc_offset;
 
     test_device(chip_id_t chip_id_, test_board_t* board_handle_) {
         physical_chip_id = chip_id_;
@@ -385,21 +367,6 @@ typedef struct test_device {
                 router_mask += 0x1 << logical_core.y;
             }
         }
-
-        // gatekeeper
-        if (run_gk_on_idle_ethernet) {
-            auto idle_eth_cores = device_handle->get_inactive_ethernet_cores();
-            if (idle_eth_cores.size() == 0) {
-                throw std::runtime_error("No idle ethernet cores found on the device");
-            }
-
-            gk_logical_core = *idle_eth_cores.begin();
-            gk_phys_core = device_handle->ethernet_core_from_logical_core(gk_logical_core);
-        } else {
-            gk_logical_core = {gk_x, gk_y};
-            gk_phys_core = device_handle->worker_core_from_logical_core(gk_logical_core);
-        }
-        gk_noc_offset = tt_metal::hal.noc_xy_encoding(gk_phys_core.x, gk_phys_core.y);
     }
 
     void create_router_kernels(std::vector<uint32_t>& compile_args, std::map<std::string, std::string>& defines) {
@@ -409,10 +376,8 @@ typedef struct test_device {
         for (auto i = 0; i < num_routers; i++) {
             // setup run time args
             std::vector<uint32_t> runtime_args = {
-                num_routers, // 0: number of active fabric routers
-                router_mask, // 1: active fabric router mask
-                gk_interface_addr, // 2: gk_message_addr_l
-                gk_noc_offset, // 3: gk_message_addr_h
+                num_routers, // 0: number of active fabric routers
+                router_mask, // 1: active fabric router mask
             };
 
             // initialize the semaphore
@@ -430,45 +395,6 @@ typedef struct test_device {
         }
     }
 
-    void create_gatekeeper_kernel(std::vector<uint32_t>& compile_args, std::map<std::string, std::string>& defines) {
-        uint32_t num_routers = router_logical_cores.size();
-        std::vector<uint32_t> zero_buf(12, 0);
-
-        std::vector<uint32_t> runtime_args = {
-            num_routers, // 0: number of active fabric routers
-            router_mask, // 1: active fabric router mask
-        };
-
-        // initialize the semaphore
-        tt::llrt::write_hex_vec_to_core(device_handle->id(), gk_phys_core, zero_buf, gk_interface_addr);
-
-        KernelHandle kernel;
-
-        if (run_gk_on_idle_ethernet) {
-            kernel = tt_metal::CreateKernel(
-                program_handle,
-                "tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp",
-                {gk_logical_core},
-                tt_metal::EthernetConfig{
-                    .eth_mode = Eth::IDLE,
-                    .noc = tt_metal::NOC::NOC_0,
-                    .compile_args = compile_args,
-                    .defines = defines});
-        } else {
-            kernel = tt_metal::CreateKernel(
-                program_handle,
-                "tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp",
-                {gk_logical_core},
-                tt_metal::DataMovementConfig{
-                    .processor = tt_metal::DataMovementProcessor::RISCV_0,
-                    .noc = tt_metal::NOC::RISCV_0_default,
-                    .compile_args = compile_args,
-                    .defines = defines});
-        }
-
-        tt_metal::SetRuntimeArgs(program_handle, kernel, gk_logical_core, runtime_args);
-    }
-
     void terminate_router_kernels() {
         std::vector<uint32_t> zero_buf(1, 0);
         for (auto& core : router_physical_cores) {
@@ -476,11 +402,6 @@ typedef struct test_device {
         }
     }
 
-    void terminate_gatekeeper_kernel() {
-        std::vector<uint32_t> zero_buf(12, 0);
-        tt::llrt::write_hex_vec_to_core(device_handle->id(), gk_phys_core, zero_buf, gk_interface_addr);
-    }
-
     std::vector<CoreCoord> select_random_worker_cores(uint32_t count) {
         std::vector<CoreCoord> result;
 
@@ -609,9 +530,7 @@ typedef struct test_traffic {
             router_phys_core.x, // 2: router_x
             router_phys_core.y, // 3: router_y
             mesh_chip_id, // 4: mesh and chip id
-            rx_buf_size, // 5: space in rx's L1
-            gk_interface_addr, // 6: gk_message_addr_l
-            tx_device->gk_noc_offset, // 7: gk_message_addr_h
+            rx_buf_size // 5: space in rx's L1
         };
 
         if (ASYNC_WR == fabric_command) {
@@ -912,8 +831,6 @@ int main(int argc, char **argv) {
     constexpr uint32_t default_tx_y = 0;
     constexpr uint32_t default_rx_x = 0;
    constexpr uint32_t default_rx_y = 3;
-    constexpr uint32_t default_gk_x = 0;
-    constexpr uint32_t default_gk_y = 9;
 
     constexpr uint32_t default_mux_x = 0;
     constexpr uint32_t default_mux_y = 1;
@@ -1022,8 +939,6 @@ int main(int argc, char **argv) {
     uint32_t tx_y = test_args::get_command_option_uint32(input_args, "--tx_y", default_tx_y);
     uint32_t rx_x = test_args::get_command_option_uint32(input_args, "--rx_x", default_rx_x);
     uint32_t rx_y = test_args::get_command_option_uint32(input_args, "--rx_y", default_rx_y);
-    gk_x = test_args::get_command_option_uint32(input_args, "--gk_x", default_gk_x);
-    gk_y = test_args::get_command_option_uint32(input_args, "--gk_y", default_gk_y);
     uint32_t prng_seed = test_args::get_command_option_uint32(input_args, "--prng_seed", default_prng_seed);
     uint32_t data_kb_per_tx = test_args::get_command_option_uint32(input_args, "--data_kb_per_tx", default_data_kb_per_tx);
     uint32_t max_packet_size_words = test_args::get_command_option_uint32(input_args, "--max_packet_size_words", default_max_packet_size_words);
@@ -1192,15 +1107,7 @@ int main(int argc, char **argv) {
         throw std::runtime_error("Test cannot run on specified device.");
     }
     */
-    if (run_gk_on_idle_ethernet) {
-        routing_table_addr = hal.get_dev_addr(HalProgrammableCoreType::IDLE_ETH, HalL1MemAddrType::UNRESERVED);
-    } else {
-        routing_table_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED);
-    }
-    gk_interface_addr = routing_table_addr + sizeof(tt::tt_fabric::fabric_router_l1_config_t) * 4;
-    socket_info_addr = gk_interface_addr + sizeof(gatekeeper_info_t);
-
-    // create router kernels
+    // launch router kernels
     std::vector<uint32_t> router_compile_args = {
         (tunneler_queue_size_bytes >> 4), // 0: rx_queue_size_words
         tunneler_test_results_addr, // 1: test_results_addr
@@ -1211,26 +1118,10 @@ int main(int argc, char **argv) {
         test_device->create_router_kernels(router_compile_args, defines);
     }
 
-    // create gatekeeper kernel
-    std::vector<uint32_t> gatekeeper_compile_args = {
-        gk_interface_addr, // 0: gk info addr
-        socket_info_addr, // 1:
-        routing_table_addr, // 2:
-        test_results_addr, // 3: test_results_addr
-        test_results_size, // 4: test_results_size
-        0, // 5: timeout_cycles
-    };
-    for (auto& [chip_id, test_device] : test_devices) {
-        test_device->create_gatekeeper_kernel(gatekeeper_compile_args, defines);
-    }
-
     if (check_txrx_timeout) {
         defines["CHECK_TIMEOUT"] = "";
     }
 
-    uint32_t client_interface_addr = routing_table_addr + sizeof(tt::tt_fabric::fabric_router_l1_config_t) * 4;
-    uint32_t client_pull_req_buf_addr = client_interface_addr + sizeof(tt_fabric_client_interface_t);
-
     std::vector<uint32_t> tx_compile_args = {
         0, //(device->id() << 8) + src_endpoint_start_id + i, // 0: src_endpoint_id
         num_dest_endpoints, // 1: num_dest_endpoints
@@ -1238,22 +1129,22 @@ int main(int argc, char **argv) {
         tx_queue_start_addr, // 3: queue_start_addr_words
         (tx_queue_size_bytes >> 4), // 4: queue_size_words
         routing_table_start_addr, // 5: routeing table
-        test_results_addr, // 6: test_results_addr
-        test_results_size, // 7: test_results_size
-        prng_seed, // 8: prng_seed
-        data_kb_per_tx, // 9: total_data_kb
-        max_packet_size_words, // 10: max_packet_size_words
-        timeout_mcycles * 1000 * 1000 * 4, // 11: timeout_cycles
-        tx_skip_pkt_content_gen, // 12: skip_pkt_content_gen
-        tx_pkt_dest_size_choice, // 13: pkt_dest_size_choice
-        tx_data_sent_per_iter_low, // 14: data_sent_per_iter_low
-        tx_data_sent_per_iter_high, // 15: data_sent_per_iter_high
-        fabric_command, // 16: fabric_command
-        target_address, // 17: target_address
-        atomic_increment, // 18: atomic_increment
-        tx_signal_address, // 19: tx_signal_address
-        client_interface_addr, // 20:
-        client_pull_req_buf_addr, // 21:
+        0, // tunneler_phys_core.x, // 6: router_x
+        0, // tunneler_phys_core.y, // 7: router_y
+        test_results_addr, // 8: test_results_addr
+        test_results_size, // 9: test_results_size
+        prng_seed, // 10: prng_seed
+        data_kb_per_tx, // 11: total_data_kb
+        max_packet_size_words, // 12: max_packet_size_words
+        timeout_mcycles * 1000 * 1000 * 4, // 13: timeout_cycles
+        tx_skip_pkt_content_gen, // 14: skip_pkt_content_gen
+        tx_pkt_dest_size_choice, // 15: pkt_dest_size_choice
+        tx_data_sent_per_iter_low, // 16: data_sent_per_iter_low
+        tx_data_sent_per_iter_high, // 17: data_sent_per_iter_high
+        fabric_command, // 18: fabric_command
+        target_address, // 19: target_address
+        atomic_increment, // 20: atomic_increment
+        tx_signal_address // 21: tx_signal_address
     };
 
     std::vector<uint32_t> rx_compile_args = {
@@ -1298,7 +1189,7 @@ int main(int argc, char **argv) {
 
     // terminate fabric routers
     for (auto& [chip_id, test_device] : test_devices) {
-        test_device->terminate_gatekeeper_kernel();
+        test_device->terminate_router_kernels();
     }
 
     // wait for programs to exit
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp
deleted file mode 100644
index a8eeb9473d5..00000000000
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp
+++ /dev/null
@@ -1,658 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#include
-#include
-#include
-#include
-#include "tt_fabric/control_plane.hpp"
-// #include "tt_metal/impl/dispatch/cq_commands.hpp"
-// #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp"
-#include "kernels/tt_fabric_traffic_gen_test.hpp"
-#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp"
-#include "eth_l1_address_map.h"
-#include "tt_fabric/hw/inc/tt_fabric_interface.h"
-
-using std::vector;
-using namespace tt;
-using json = nlohmann::json;
-
-int main(int argc, char** argv) {
-    constexpr uint32_t default_tx_x = 0;
-    constexpr uint32_t default_tx_y = 0;
-    constexpr uint32_t default_rx_x = 0;
-    constexpr uint32_t default_rx_y = 3;
-    constexpr uint32_t default_gk_x = 0;
-    constexpr uint32_t default_gk_y = 9;
-
-    constexpr uint32_t default_mux_x = 0;
-    constexpr uint32_t default_mux_y = 1;
-    constexpr uint32_t default_demux_x = 0;
-    constexpr uint32_t default_demux_y = 2;
-
-    constexpr uint32_t default_prng_seed = 0x100;
-    constexpr uint32_t default_data_kb_per_tx = 1024 * 1024;
-    constexpr uint32_t default_max_packet_size_words = 0x100;
-
-    constexpr uint32_t default_routing_table_start_addr = 0x7EC00;
-    constexpr uint32_t default_tx_queue_start_addr = 0x80000;
-    constexpr uint32_t default_tx_queue_size_bytes = 0x10000;
-    constexpr uint32_t default_rx_queue_start_addr = 0x80000;
-    constexpr uint32_t default_rx_queue_size_bytes = 0x10000;
-    constexpr uint32_t default_tx_signal_address = 0x70000;
-
-    constexpr uint32_t default_test_results_addr = 0x100000;
-    constexpr uint32_t default_test_results_size = 0x40000;
-
-    // TODO: use eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE which should be 0x19900, set test results size back
-    // to 0x7000
-    constexpr uint32_t default_tunneler_queue_size_bytes =
-        0x4000;  // max fvc size (send + receive. top half is send buffer, bottom half is receive buffer)
-    constexpr uint32_t default_tunneler_test_results_addr =
-        0x39000;  // 0x8000 * 4 + 0x19000; 0x10000 * 4 + 0x19000 = 0x59000 > 0x40000 (256kB)
-    constexpr uint32_t default_tunneler_test_results_size = 0x6000;  // 256kB total L1 in ethernet core - 0x39000
-
-    constexpr uint32_t default_timeout_mcycles = 1000;
-    constexpr uint32_t default_rx_disable_data_check = 0;
-    constexpr uint32_t default_rx_disable_header_check = 0;
-    constexpr uint32_t default_tx_skip_pkt_content_gen = 0;
-    constexpr uint32_t default_check_txrx_timeout = 1;
-
-    constexpr uint32_t src_endpoint_start_id = 4;
-    constexpr uint32_t dest_endpoint_start_id = 11;
-
-    constexpr uint32_t num_endpoints = 1;
-    constexpr uint32_t num_src_endpoints = num_endpoints;
-    constexpr uint32_t num_dest_endpoints = num_endpoints;
-
-    constexpr uint8_t default_tx_pkt_dest_size_choice = 0;  // pkt_dest_size_choices_t
-
-    constexpr uint32_t default_tx_data_sent_per_iter_low = 20;
-    constexpr uint32_t default_tx_data_sent_per_iter_high = 240;
-
-    constexpr uint32_t default_fabric_command = 1;
-
-    constexpr uint32_t default_dump_stat_json = 0;
-    constexpr const char* default_output_dir = "/tmp";
-
-    constexpr uint32_t default_test_device_id_l = 0;
-    constexpr uint32_t default_test_device_id_r = -1;
-
-    constexpr uint32_t default_target_address = 0x30000;
-
-    constexpr uint32_t default_atomic_increment = 4;
-
-    std::vector<std::string> input_args(argv, argv + argc);
-    if (test_args::has_command_option(input_args, "-h") || test_args::has_command_option(input_args, "--help")) {
-        log_info(LogTest, "Usage:");
-        log_info(LogTest, "  --prng_seed: PRNG seed, default = 0x{:x}", default_prng_seed);
-        log_info(LogTest, "  --data_kb_per_tx: Total data in KB per TX endpoint, default = {}", default_data_kb_per_tx);
-        log_info(
-            LogTest,
-            "  --max_packet_size_words: Max packet size in words, default = 0x{:x}",
-            default_max_packet_size_words);
-        log_info(LogTest, "  --tx_x: X coordinate of the starting TX core, default = {}", default_tx_x);
-        log_info(LogTest, "  --tx_y: Y coordinate of the starting TX core, default = {}", default_tx_y);
-        log_info(LogTest, "  --rx_x: X coordinate of the starting RX core, default = {}", default_rx_x);
-        log_info(LogTest, "  --rx_y: Y coordinate of the starting RX core, default = {}", default_rx_y);
-        log_info(
-            LogTest,
-            "  --routing_table_start_addr: Routing Table start address, default = 0x{:x}",
-            default_routing_table_start_addr);
-        log_info(
-            LogTest, "  --tx_queue_start_addr: TX queue start address, default = 0x{:x}", default_tx_queue_start_addr);
-        log_info(
-            LogTest, "  --tx_queue_size_bytes: TX queue size in bytes, default = 0x{:x}", default_tx_queue_size_bytes);
-        log_info(
-            LogTest, "  --rx_queue_start_addr: RX queue start address, default = 0x{:x}", default_rx_queue_start_addr);
-        log_info(
-            LogTest, "  --rx_queue_size_bytes: RX queue size in bytes, default = 0x{:x}", default_rx_queue_size_bytes);
-        log_info(
-            LogTest, "  --test_results_addr: test results buf address, default = 0x{:x}", default_test_results_addr);
-        log_info(LogTest, "  --test_results_size: test results buf size, default = 0x{:x}", default_test_results_size);
-        log_info(LogTest, "  --timeout_mcycles: Timeout in MCycles, default = {}", default_timeout_mcycles);
-        log_info(
-            LogTest,
-            "  --check_txrx_timeout: Check if timeout happens during tx & rx (if enabled, timeout_mcycles will also be "
-            "used), default = {}",
-            default_check_txrx_timeout);
-        log_info(
-            LogTest,
-            "  --rx_disable_data_check: Disable data
check on RX, default = {}", - default_rx_disable_data_check); - log_info( - LogTest, - " --rx_disable_header_check: Disable header check on RX, default = {}", - default_rx_disable_header_check); - log_info( - LogTest, - " --tx_skip_pkt_content_gen: Skip packet content generation during tx, default = {}", - default_tx_skip_pkt_content_gen); - log_info( - LogTest, - " --tx_pkt_dest_size_choice: choice for how packet destination and packet size are generated, default = " - "{}", - default_tx_pkt_dest_size_choice); // pkt_dest_size_choices_t - log_info( - LogTest, - " --tx_data_sent_per_iter_low: the criteria to determine the amount of tx data sent per iter is low " - "(unit: words); if both 0, then disable counting it in tx kernel, default = {}", - default_tx_data_sent_per_iter_low); - log_info( - LogTest, - " --tx_data_sent_per_iter_high: the criteria to determine the amount of tx data sent per iter is high " - "(unit: words); if both 0, then disable counting it in tx kernel, default = {}", - default_tx_data_sent_per_iter_high); - log_info(LogTest, " --dump_stat_json: Dump stats in json to output_dir, default = {}", default_dump_stat_json); - log_info(LogTest, " --output_dir: Output directory, default = {}", default_output_dir); - log_info( - LogTest, " --device_id: Device on which the test will be run, default = {}", default_test_device_id_l); - log_info( - LogTest, " --device_id_r: Device on which the test will be run, default = {}", default_test_device_id_r); - return 0; - } - - uint32_t tx_x = test_args::get_command_option_uint32(input_args, "--tx_x", default_tx_x); - uint32_t tx_y = test_args::get_command_option_uint32(input_args, "--tx_y", default_tx_y); - uint32_t rx_x = test_args::get_command_option_uint32(input_args, "--rx_x", default_rx_x); - uint32_t rx_y = test_args::get_command_option_uint32(input_args, "--rx_y", default_rx_y); - uint32_t gk_x = test_args::get_command_option_uint32(input_args, "--gk_x", default_gk_x); - uint32_t gk_y = test_args::get_command_option_uint32(input_args, "--gk_y", default_gk_y); - uint32_t prng_seed = test_args::get_command_option_uint32(input_args, "--prng_seed", default_prng_seed); - uint32_t data_kb_per_tx = - test_args::get_command_option_uint32(input_args, "--data_kb_per_tx", default_data_kb_per_tx); - uint32_t max_packet_size_words = - test_args::get_command_option_uint32(input_args, "--max_packet_size_words", default_max_packet_size_words); - uint32_t routing_table_start_addr = test_args::get_command_option_uint32( - input_args, "--routing_table_start_addr", default_routing_table_start_addr); - uint32_t tx_queue_start_addr = - test_args::get_command_option_uint32(input_args, "--tx_queue_start_addr", default_tx_queue_start_addr); - uint32_t tx_queue_size_bytes = - test_args::get_command_option_uint32(input_args, "--tx_queue_size_bytes", default_tx_queue_size_bytes); - uint32_t rx_queue_start_addr = - test_args::get_command_option_uint32(input_args, "--rx_queue_start_addr", default_rx_queue_start_addr); - uint32_t rx_queue_size_bytes = - test_args::get_command_option_uint32(input_args, "--rx_queue_size_bytes", default_rx_queue_size_bytes); - uint32_t tunneler_queue_size_bytes = test_args::get_command_option_uint32( - input_args, "--tunneler_queue_size_bytes", default_tunneler_queue_size_bytes); - uint32_t test_results_addr = - test_args::get_command_option_uint32(input_args, "--test_results_addr", default_test_results_addr); - uint32_t test_results_size = - test_args::get_command_option_uint32(input_args, "--test_results_size", 
default_test_results_size); - uint32_t tunneler_test_results_addr = test_args::get_command_option_uint32( - input_args, "--tunneler_test_results_addr", default_tunneler_test_results_addr); - uint32_t tunneler_test_results_size = test_args::get_command_option_uint32( - input_args, "--tunneler_test_results_size", default_tunneler_test_results_size); - uint32_t timeout_mcycles = - test_args::get_command_option_uint32(input_args, "--timeout_mcycles", default_timeout_mcycles); - uint32_t rx_disable_data_check = - test_args::get_command_option_uint32(input_args, "--rx_disable_data_check", default_rx_disable_data_check); - uint32_t rx_disable_header_check = - test_args::get_command_option_uint32(input_args, "--rx_disable_header_check", default_rx_disable_header_check); - uint32_t tx_skip_pkt_content_gen = - test_args::get_command_option_uint32(input_args, "--tx_skip_pkt_content_gen", default_tx_skip_pkt_content_gen); - uint32_t dump_stat_json = - test_args::get_command_option_uint32(input_args, "--dump_stat_json", default_dump_stat_json); - std::string output_dir = test_args::get_command_option(input_args, "--output_dir", std::string(default_output_dir)); - uint32_t check_txrx_timeout = - test_args::get_command_option_uint32(input_args, "--check_txrx_timeout", default_check_txrx_timeout); - uint8_t tx_pkt_dest_size_choice = (uint8_t)test_args::get_command_option_uint32( - input_args, "--tx_pkt_dest_size_choice", default_tx_pkt_dest_size_choice); - uint32_t tx_data_sent_per_iter_low = test_args::get_command_option_uint32( - input_args, "--tx_data_sent_per_iter_low", default_tx_data_sent_per_iter_low); - uint32_t tx_data_sent_per_iter_high = test_args::get_command_option_uint32( - input_args, "--tx_data_sent_per_iter_high", default_tx_data_sent_per_iter_high); - uint32_t fabric_command = - test_args::get_command_option_uint32(input_args, "--fabric_command", default_fabric_command); - uint32_t target_address = - test_args::get_command_option_uint32(input_args, "--target_address", default_target_address); - uint32_t atomic_increment = - test_args::get_command_option_uint32(input_args, "--atomic_increment", default_atomic_increment); - assert( - (pkt_dest_size_choices_t)tx_pkt_dest_size_choice == pkt_dest_size_choices_t::SAME_START_RNDROBIN_FIX_SIZE && - rx_disable_header_check || - (pkt_dest_size_choices_t)tx_pkt_dest_size_choice == pkt_dest_size_choices_t::RANDOM); - - uint32_t test_device_id_l = - test_args::get_command_option_uint32(input_args, "--device_id", default_test_device_id_l); - uint32_t test_device_id_r = - test_args::get_command_option_uint32(input_args, "--device_id_r", default_test_device_id_r); - - uint32_t tx_signal_address = default_tx_signal_address; - - bool pass = true; - - CoreCoord gk_core = {gk_x, gk_y}; - - std::map defines = { - {"FD_CORE_TYPE", std::to_string(0)}, // todo, support dispatch on eth - }; - - try { - const std::filesystem::path tg_mesh_graph_desc_path = - std::filesystem::path(tt::llrt::RunTimeOptions::get_instance().get_root_dir()) / - "tt_fabric/mesh_graph_descriptors/tg_mesh_graph_descriptor.yaml"; - auto control_plane = std::make_unique(tg_mesh_graph_desc_path.string()); - - int num_devices = tt_metal::GetNumAvailableDevices(); - if (test_device_id_l >= num_devices) { - log_info( - LogTest, "Device {} is not valid. 
Highest valid device id = {}.", test_device_id_l, num_devices - 1); - throw std::runtime_error("Invalid Device Id."); - } - - std::map device_map; - - std::vector chip_ids; - for (unsigned int id = 4; id < 36; id++) { - chip_ids.push_back(id); - } - device_map = tt::tt_metal::detail::CreateDevices(chip_ids); - - log_info(LogTest, "Created {} Devices ...", device_map.size()); - - std::map program_map; - - for (uint32_t i = 4; i < 36; i++) { - program_map[i] = tt_metal::CreateProgram(); - } - - log_info(LogTest, "Created Programs ..."); - - std::map> device_router_map; - - const auto& device_active_eth_cores = device_map[test_device_id_l]->get_active_ethernet_cores(); - - if (device_active_eth_cores.size() == 0) { - log_info( - LogTest, - "Device {} does not have enough active cores. Need 1 active ethernet core for this test.", - test_device_id_l); - for (auto dev : device_map) { - tt_metal::CloseDevice(dev.second); - } - - throw std::runtime_error("Test cannot run on specified device."); - } - - auto [dev_l_mesh_id, dev_l_chip_id] = control_plane->get_mesh_chip_id_from_physical_chip_id(test_device_id_l); - auto [dev_r_mesh_id, dev_r_chip_id] = control_plane->get_mesh_chip_id_from_physical_chip_id(test_device_id_r); - - log_info( - LogTest, - "Running on Left Device {} : Fabric Mesh Id {} : Fabric Device Id {}", - test_device_id_l, - dev_l_mesh_id, - dev_l_chip_id); - log_info( - LogTest, - "Running on Right Device {} : Fabric Mesh Id {} : Fabric Device Id {}", - test_device_id_r, - dev_r_mesh_id, - dev_r_chip_id); - - bool router_core_found = false; - CoreCoord router_logical_core; - CoreCoord router_phys_core; - CoreCoord gk_phys_core; - uint32_t routing_table_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); - uint32_t gk_interface_addr = routing_table_addr + sizeof(tt::tt_fabric::fabric_router_l1_config_t) * 4; - uint32_t client_interface_addr = routing_table_addr + sizeof(tt::tt_fabric::fabric_router_l1_config_t) * 4; - uint32_t client_pull_req_buf_addr = client_interface_addr + sizeof(tt_fabric_client_interface_t); - uint32_t socket_info_addr = gk_interface_addr + sizeof(gatekeeper_info_t); - log_info(LogTest, "GK Routing Table Addr = 0x{:08X}", routing_table_addr); - log_info(LogTest, "GK Info Addr = 0x{:08X}", gk_interface_addr); - log_info(LogTest, "GK Socket Info Addr = 0x{:08X}", socket_info_addr); - - for (auto device : device_map) { - auto neighbors = tt::Cluster::instance().get_ethernet_connected_device_ids(device.first); - std::vector device_router_cores; - std::vector device_router_phys_cores; - uint32_t router_mask = 0; - for (auto neighbor : neighbors) { - if (device_map.contains(neighbor)) { - if (!router_core_found && device.first == test_device_id_l) { - // pick a router so that tx and read in routing tables from this core on the - // sender device. 
- router_logical_core = device.second->get_ethernet_sockets(neighbor)[0]; - router_phys_core = device.second->ethernet_core_from_logical_core(router_logical_core); - router_core_found = true; - } - auto connected_logical_cores = device.second->get_ethernet_sockets(neighbor); - for (auto logical_core : connected_logical_cores) { - device_router_cores.push_back(logical_core); - device_router_phys_cores.push_back( - device.second->ethernet_core_from_logical_core(logical_core)); - router_mask += 0x1 << logical_core.y; - } - } else { - log_debug( - LogTest, - "Device {} skiping Neighbor Device {} since it is not in test device map.", - device.first, - neighbor); - } - } - - log_info(LogTest, "Device {} router_mask = 0x{:04X}", device.first, router_mask); - uint32_t sem_count = device_router_cores.size(); - device_router_map[device.first] = device_router_phys_cores; - - gk_phys_core = (device.second->worker_core_from_logical_core(gk_core)); - uint32_t gk_noc_offset = tt_metal::hal.noc_xy_encoding(gk_phys_core.x, gk_phys_core.y); - - std::vector router_compile_args = { - (tunneler_queue_size_bytes >> 4), // 0: rx_queue_size_words - tunneler_test_results_addr, // 1: test_results_addr - tunneler_test_results_size, // 2: test_results_size - 0, // 3: timeout_cycles - }; - - std::vector router_runtime_args = { - sem_count, // 0: number of active fabric routers - router_mask, // 1: active fabric router mask - gk_interface_addr, // 2: gk_message_addr_l - gk_noc_offset, // 3: gk_message_addr_h - }; - - for (auto logical_core : device_router_cores) { - auto router_kernel = tt_metal::CreateKernel( - program_map[device.first], - "tt_fabric/impl/kernels/tt_fabric_router.cpp", - logical_core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_0, .compile_args = router_compile_args, .defines = defines}); - - tt_metal::SetRuntimeArgs(program_map[device.first], router_kernel, logical_core, router_runtime_args); - - log_debug( - LogTest, - "Device {} router added on physical core {}", - device.first, - device.second->ethernet_core_from_logical_core(logical_core)); - } - // setup runtime args - log_info(LogTest, "run tt_fabric gatekeeper at x={},y={}", gk_core.x, gk_core.y); - std::vector gk_compile_args = { - gk_interface_addr, // 0: - socket_info_addr, // 1: - routing_table_addr, // 2 - test_results_addr, // 3: test_results_addr - test_results_size, // 4: test_results_size - 0, // 5: timeout_cycles - }; - - std::vector gk_runtime_args = { - sem_count, // 0: number of active fabric routers - router_mask, // 1: active fabric router mask - }; - - auto kernel = tt_metal::CreateKernel( - program_map[device.first], - "tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp", - {gk_core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = gk_compile_args, - .defines = defines}); - - tt_metal::SetRuntimeArgs(program_map[device.first], kernel, gk_core, gk_runtime_args); - } - - if (check_txrx_timeout) { - defines["CHECK_TIMEOUT"] = ""; - } - - std::vector tx_phys_core; - for (uint32_t i = 0; i < num_src_endpoints; i++) { - CoreCoord core = {tx_x + i, tx_y}; - tx_phys_core.push_back(device_map[test_device_id_l]->worker_core_from_logical_core(core)); - CoreCoord tx_gk_phys_core = device_map[test_device_id_l]->worker_core_from_logical_core(gk_core); - std::vector compile_args = { - (device_map[test_device_id_l]->id() << 8) + src_endpoint_start_id + i, // 0: src_endpoint_id - num_dest_endpoints, // 1: num_dest_endpoints - 
dest_endpoint_start_id, // 2: - tx_queue_start_addr, // 3: queue_start_addr_words - (tx_queue_size_bytes >> 4), // 4: queue_size_words - routing_table_start_addr, // 5: routeing table - gk_interface_addr, // 6: gk_message_addr_l - (tx_gk_phys_core.y << 10) | (tx_gk_phys_core.x << 4), // 7: gk_message_addr_h - test_results_addr, // 8: test_results_addr - test_results_size, // 9: test_results_size - prng_seed, // 10: prng_seed - data_kb_per_tx, // 11: total_data_kb - max_packet_size_words, // 12: max_packet_size_words - timeout_mcycles * 1000 * 1000 * 4, // 13: timeout_cycles - tx_skip_pkt_content_gen, // 14: skip_pkt_content_gen - tx_pkt_dest_size_choice, // 15: pkt_dest_size_choice - tx_data_sent_per_iter_low, // 16: data_sent_per_iter_low - tx_data_sent_per_iter_high, // 17: data_sent_per_iter_high - fabric_command, // 18: fabric command - target_address, - atomic_increment, - tx_signal_address, - client_interface_addr, - client_pull_req_buf_addr, - }; - - // setup runtime args - std::vector runtime_args = { - (device_map[test_device_id_l]->id() << 8) + src_endpoint_start_id + i, // 0: src_endpoint_id - 0x410, // 1: dest_noc_offset - router_phys_core.x, - router_phys_core.y, - (dev_r_mesh_id << 16 | dev_r_chip_id)}; - - if (ASYNC_WR == fabric_command) { - runtime_args.push_back(target_address); - } - - log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); - auto kernel = tt_metal::CreateKernel( - program_map[test_device_id_l], - "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp", - {core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = compile_args, - .defines = defines}); - - tt_metal::SetRuntimeArgs(program_map[test_device_id_l], kernel, core, runtime_args); - } - - std::vector rx_phys_core; - for (uint32_t i = 0; i < num_dest_endpoints; i++) { - CoreCoord core = {rx_x + i, rx_y}; - rx_phys_core.push_back(device_map[test_device_id_r]->worker_core_from_logical_core(core)); - CoreCoord rx_gk_phys_core = device_map[test_device_id_r]->worker_core_from_logical_core(gk_core); - - std::vector compile_args = { - prng_seed, // 0: prng seed - data_kb_per_tx, // 1: total data kb - max_packet_size_words, // 2: max packet size (in words) - fabric_command, // 3: fabric command - target_address, // 4: target address - atomic_increment, // 5: atomic increment - test_results_addr, // 6: test results addr - test_results_size, // 7: test results size in bytes - gk_interface_addr, // 8: gk_message_addr_l - (rx_gk_phys_core.y << 10) | (rx_gk_phys_core.x << 4), // 9: gk_message_addr_h - client_interface_addr, // 10: - client_pull_req_buf_addr, // 11: - rx_queue_start_addr, // 12: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 13: queue_size_words - }; - - std::vector runtime_args = {(dev_l_mesh_id << 16 | dev_l_chip_id)}; - - log_info(LogTest, "run socket rx at x={},y={}", core.x, core.y); - auto kernel = tt_metal::CreateKernel( - program_map[test_device_id_r], - "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp", - {core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = compile_args, - .defines = defines}); - - tt_metal::SetRuntimeArgs(program_map[test_device_id_r], kernel, core, runtime_args); - } - - if (check_txrx_timeout) { - defines.erase("CHECK_TIMEOUT"); - } - - log_info(LogTest, 
"Starting test..."); - - auto start = std::chrono::system_clock::now(); - - // Initialize tt_fabric router sync semaphore to 0. Routers on each device - // increment once handshake with ethernet peer has been completed. - std::vector zero_buf(1, 0); - std::vector gk_zero_buf(12, 0); - for (auto [device_id, router_phys_cores] : device_router_map) { - for (auto phys_core : router_phys_cores) { - tt::llrt::write_hex_vec_to_core(device_id, phys_core, zero_buf, FABRIC_ROUTER_SYNC_SEM); - } - tt::llrt::write_hex_vec_to_core(device_id, gk_phys_core, gk_zero_buf, gk_interface_addr); - } - - for (uint32_t i = 0; i < num_src_endpoints; i++) { - tt::llrt::write_hex_vec_to_core(test_device_id_l, tx_phys_core[i], zero_buf, tx_signal_address); - } - - // clear test status to 0. it will be set to non-zero value by tx kernel. - tt::llrt::write_hex_vec_to_core(test_device_id_l, tx_phys_core[0], zero_buf, test_results_addr); - - for (auto device : device_map) { - log_info(LogTest, "Launching on {}", device.first); - tt_metal::detail::LaunchProgram(device.second, program_map[device.first], false); - } - - // Once all the kernels have been launched on all devices, set the tx_start signal - // to trigger tx kernels to start sending data. - std::vector tx_start(1, 1); - for (uint32_t i = 0; i < num_src_endpoints; i++) { - tt::llrt::write_hex_vec_to_core(test_device_id_l, tx_phys_core[i], tx_start, tx_signal_address); - } - - // Wait for tx to return non-zero status. - while (1) { - auto tx_status = tt::llrt::read_hex_vec_from_core(test_device_id_l, tx_phys_core[0], test_results_addr, 4); - if ((tx_status[0] & 0xFFFF) != 0) { - break; - } - } - - log_info(LogTest, "Tx Finished"); - - // terminate gatekeeper. - // Gatekeeper will signal all routers on the device to terminate. - for (auto [device_id, router_phys_cores] : device_router_map) { - tt::llrt::write_hex_vec_to_core(device_id, gk_phys_core, zero_buf, gk_interface_addr); - } - - // wait for all kernels to finish. - for (auto device : device_map) { - tt_metal::detail::WaitProgramDone(device.second, program_map[device.first]); - } - - auto end = std::chrono::system_clock::now(); - - std::chrono::duration elapsed_seconds = (end - start); - log_info(LogTest, "Ran in {:.2f}us", elapsed_seconds.count() * 1000 * 1000); - - vector> tx_results; - - for (uint32_t i = 0; i < num_src_endpoints; i++) { - tx_results.push_back(tt::llrt::read_hex_vec_from_core( - device_map[test_device_id_l]->id(), tx_phys_core[i], test_results_addr, 128)); - log_info( - LogTest, - "TX{} status = {}", - i, - packet_queue_test_status_to_string(tx_results[i][PQ_TEST_STATUS_INDEX])); - pass &= (tx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - } - /* - TODO: Need to add these once control plane api is available to - get the first and last hop fabric router core. 
- vector router_results = - tt::llrt::read_hex_vec_from_core( - device_map[test_device_id_l]->id(), tunneler_phys_core, tunneler_test_results_addr, 128); - log_info(LogTest, "L Router status = {}", - packet_queue_test_status_to_string(router_results[PQ_TEST_STATUS_INDEX])); pass &= - (router_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - - vector r_router_results = - tt::llrt::read_hex_vec_from_core( - device_map[test_device_id_r]->id(), r_tunneler_phys_core, tunneler_test_results_addr, 128); - log_info(LogTest, "R Router status = {}", - packet_queue_test_status_to_string(r_router_results[PQ_TEST_STATUS_INDEX])); pass &= - (r_router_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); - */ - for (auto active_device : device_map) { - pass &= tt_metal::CloseDevice(active_device.second); - } - - if (pass) { - double total_tx_bw = 0.0; - uint64_t total_tx_words_sent = 0; - uint64_t total_rx_words_checked = 0; - for (uint32_t i = 0; i < num_src_endpoints; i++) { - uint64_t tx_words_sent = get_64b_result(tx_results[i], PQ_TEST_WORD_CNT_INDEX); - total_tx_words_sent += tx_words_sent; - uint64_t tx_elapsed_cycles = get_64b_result(tx_results[i], PQ_TEST_CYCLES_INDEX); - double tx_bw = ((double)tx_words_sent) * PACKET_WORD_SIZE_BYTES / tx_elapsed_cycles; - total_tx_bw += tx_bw; - uint64_t iter = get_64b_result(tx_results[i], PQ_TEST_ITER_INDEX); - // uint64_t zero_data_sent_iter = get_64b_result(tx_results[i], TX_TEST_IDX_ZERO_DATA_WORDS_SENT_ITER); - // uint64_t few_data_sent_iter = get_64b_result(tx_results[i], TX_TEST_IDX_FEW_DATA_WORDS_SENT_ITER); - // uint64_t many_data_sent_iter = get_64b_result(tx_results[i], TX_TEST_IDX_MANY_DATA_WORDS_SENT_ITER); - // uint64_t num_packets = get_64b_result(tx_results[i], TX_TEST_IDX_NPKT); - // double bytes_per_pkt = static_cast(tx_words_sent) * PACKET_WORD_SIZE_BYTES / - // static_cast(num_packets); - - log_info( - LogTest, - "TX {} words sent = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", - i, - tx_words_sent, - tx_elapsed_cycles, - tx_bw); - // log_info(LogTest, "TX {} packets sent = {}, bytes/packet = {:.2f}, total iter = {}, zero data sent - // iter = {}, few data sent iter = {}, many data sent iter = {}", i, num_packets, bytes_per_pkt, iter, - // zero_data_sent_iter, few_data_sent_iter, many_data_sent_iter); - /* - stat[fmt::format("tx_words_sent_{}", i)] = tx_words_sent; - stat[fmt::format("tx_elapsed_cycles_{}", i)] = tx_elapsed_cycles; - stat[fmt::format("tx_bw_{}", i)] = tx_bw; - stat[fmt::format("tx_bytes_per_pkt_{}", i)] = bytes_per_pkt; - stat[fmt::format("tx_total_iter_{}", i)] = iter; - stat[fmt::format("tx_zero_data_sent_iter_{}", i)] = zero_data_sent_iter; - stat[fmt::format("tx_few_data_sent_iter_{}", i)] = few_data_sent_iter; - stat[fmt::format("tx_many_data_sent_iter_{}", i)] = many_data_sent_iter; - */ - } - log_info(LogTest, "Total TX BW = {:.2f} B/cycle", total_tx_bw); - } - - } catch (const std::exception& e) { - pass = false; - log_fatal(e.what()); - } - - tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); - - if (pass) { - log_info(LogTest, "Test Passed"); - return 0; - } else { - log_fatal(LogTest, "Test Failed\n"); - return 1; - } -} diff --git a/tt_fabric/hw/inc/routing_table.h b/tt_fabric/hw/inc/routing_table.h index 70c862cc009..aac39816745 100644 --- a/tt_fabric/hw/inc/routing_table.h +++ b/tt_fabric/hw/inc/routing_table.h @@ -57,7 +57,6 @@ struct fabric_router_l1_config_t { std::uint16_t my_mesh_id; // Do we need this if we tag routing tables with magic values for outbound eth 
channels // and route to local NOC? std::uint16_t my_device_id; - std::uint8_t padding[8]; // pad to 16-byte alignment. } __attribute__((packed)); /* diff --git a/tt_fabric/hw/inc/tt_fabric.h b/tt_fabric/hw/inc/tt_fabric.h index 60e5728945d..d5196c6605a 100644 --- a/tt_fabric/hw/inc/tt_fabric.h +++ b/tt_fabric/hw/inc/tt_fabric.h @@ -9,9 +9,9 @@ #include "dataflow_api.h" #include "noc_overlay_parameters.h" #include "ethernet/dataflow_api.h" -#include "tt_fabric/hw/inc/routing_table.h" -#include "tt_fabric/hw/inc/tt_fabric_interface.h" -#include "tt_fabric/hw/inc/eth_chan_noc_mapping.h" +#include "hw/inc/routing_table.h" +#include "hw/inc/tt_fabric_interface.h" +#include "hw/inc/eth_chan_noc_mapping.h" constexpr ProgrammableCoreType fd_core_type = static_cast(FD_CORE_TYPE); @@ -25,10 +25,6 @@ extern volatile tt::tt_fabric::fabric_router_l1_config_t* routing_table; uint64_t tt_fabric_send_pull_request(uint64_t dest_addr, volatile local_pull_request_t* local_pull_request); bool tt_fabric_is_header_valid(packet_header_t* p_header); -uint32_t num_words_available_to_pull(volatile pull_request_t* pull_request); -uint32_t words_before_buffer_wrap(uint32_t buffer_size, uint32_t rd_ptr); -uint32_t advance_ptr(uint32_t buffer_size, uint32_t ptr, uint32_t inc_words); -uint32_t get_rd_ptr_offset_words(pull_request_t* pull_request); inline uint64_t get_timestamp() { uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); @@ -475,7 +471,7 @@ typedef struct fvc_producer_state { } } - template + template inline uint32_t pull_data_from_fvc_buffer() { uint32_t words_available = get_num_words_available(); words_available = std::min(words_available, packet_words_remaining); @@ -497,10 +493,7 @@ typedef struct fvc_producer_state { packet_words_remaining -= words_available; advance_out_rdptr(words_available); // issue noc write to noc target of pull request. - uint64_t dest_addr = socket_mode == false - ? ((uint64_t)get_next_hop_router_noc_xy() << 32) | FABRIC_ROUTER_REQ_QUEUE_START - : ((uint64_t)current_packet_header.session.target_offset_h << 32) | - current_packet_header.session.target_offset_l; + uint64_t dest_addr = ((uint64_t)get_next_hop_router_noc_xy() << 32) | FABRIC_ROUTER_REQ_QUEUE_START; packet_dest = tt_fabric_send_pull_request(dest_addr, local_pull_request); if (current_packet_header.routing.flags == INLINE_FORWARD) { curr_packet_valid = false; @@ -554,59 +547,51 @@ typedef struct fvc_producer_state { inline uint32_t process_inbound_packet() { uint32_t words_processed = 0; if (packet_is_for_local_chip()) { - if (current_packet_header.routing.flags == FORWARD) { - if (current_packet_header.session.command == ASYNC_WR) { - if (packet_in_progress == 0) { - packet_dest = ((uint64_t)current_packet_header.session.target_offset_h << 32) | - current_packet_header.session.target_offset_l; - packet_words_remaining -= PACKET_HEADER_SIZE_WORDS; - advance_out_wrptr(PACKET_HEADER_SIZE_WORDS); - advance_out_rdptr(PACKET_HEADER_SIZE_WORDS); - // subtract the header words. Remaining words are the data to be written to packet_dest. - // Remember to account for trailing bytes which may not be a full packet word. 
- packet_in_progress = 1; - words_processed = PACKET_HEADER_SIZE_WORDS; - words_processed += issue_async_write(); - } else { - flush_async_writes(); - if (packet_words_remaining) { - words_processed = issue_async_write(); - } else { - packet_in_progress = 0; - curr_packet_valid = false; - packet_timestamp = get_timestamp(); - } - } - } else if (current_packet_header.session.command == DSOCKET_WR) { - words_processed = pull_data_from_fvc_buffer(); - } - } else if (current_packet_header.routing.flags == INLINE_FORWARD) { - if (current_packet_header.session.command == SOCKET_CLOSE) { - words_processed = pull_data_from_fvc_buffer(); - } else { - uint64_t noc_addr = ((uint64_t)current_packet_header.session.target_offset_h << 32) | - current_packet_header.session.target_offset_l; - noc_fast_atomic_increment( - noc_index, - NCRISC_AT_CMD_BUF, - noc_addr, - NOC_UNICAST_WRITE_VC, - current_packet_header.packet_parameters.atomic_parameters.increment, - current_packet_header.packet_parameters.atomic_parameters.wrap_boundary, - false); - + if (current_packet_header.routing.flags == FORWARD && current_packet_header.session.command == ASYNC_WR) { + if (packet_in_progress == 0) { + packet_dest = ((uint64_t)current_packet_header.session.target_offset_h << 32) | + current_packet_header.session.target_offset_l; packet_words_remaining -= PACKET_HEADER_SIZE_WORDS; advance_out_wrptr(PACKET_HEADER_SIZE_WORDS); advance_out_rdptr(PACKET_HEADER_SIZE_WORDS); + // subtract the header words. Remaining words are the data to be written to packet_dest. + // Remember to account for trailing bytes which may not be a full packet word. + packet_in_progress = 1; words_processed = PACKET_HEADER_SIZE_WORDS; - fvc_pull_rdptr = fvc_out_rdptr; - update_remote_rdptr_cleared(); - curr_packet_valid = false; - packet_timestamp = get_timestamp(); + words_processed += issue_async_write(); + } else { + flush_async_writes(); + if (packet_words_remaining) { + words_processed = issue_async_write(); + } else { + packet_in_progress = 0; + curr_packet_valid = false; + packet_timestamp = get_timestamp(); + } } + } else if (current_packet_header.routing.flags == INLINE_FORWARD) { + uint64_t noc_addr = ((uint64_t)current_packet_header.session.target_offset_h << 32) | + current_packet_header.session.target_offset_l; + noc_fast_atomic_increment( + noc_index, + NCRISC_AT_CMD_BUF, + noc_addr, + NOC_UNICAST_WRITE_VC, + current_packet_header.packet_parameters.atomic_parameters.increment, + current_packet_header.packet_parameters.atomic_parameters.wrap_boundary, + false); + + packet_words_remaining -= PACKET_HEADER_SIZE_WORDS; + advance_out_wrptr(PACKET_HEADER_SIZE_WORDS); + advance_out_rdptr(PACKET_HEADER_SIZE_WORDS); + words_processed = PACKET_HEADER_SIZE_WORDS; + fvc_pull_rdptr = fvc_out_rdptr; + update_remote_rdptr_cleared(); + curr_packet_valid = false; + packet_timestamp = get_timestamp(); } } else { - words_processed = pull_data_from_fvc_buffer(); + words_processed = pull_data_from_fvc_buffer(); } return words_processed; } @@ -619,518 +604,6 @@ typedef struct fvc_producer_state { } } fvc_producer_state_t; -typedef struct fvcc_outbound_state { - volatile chan_payload_ptr remote_rdptr; - uint32_t remote_ptr_update_addr; - volatile ctrl_chan_msg_buf* - fvcc_buf; // fvcc buffer that receives messages that need to be forwarded over ethernet. 
- volatile ctrl_chan_sync_buf* fvcc_sync_buf; // sync buffer to hold pointer updates sent over ethernet - uint32_t remote_fvcc_buf_start; - uint32_t out_rdptr; - - // Check if ethernet receiver fvcc on neighboring chip is full. - bool is_remote_fvcc_full() { - uint32_t rd_ptr = remote_rdptr.ptr_cleared; - bool full = false; - if (out_rdptr != rd_ptr) { - uint32_t distance = out_rdptr >= rd_ptr ? out_rdptr - rd_ptr : out_rdptr + 2 * FVCC_BUF_SIZE - rd_ptr; - full = distance >= FVCC_BUF_SIZE; - } - return full; - } - - inline void init(uint32_t buf_start, uint32_t sync_buf_start, uint32_t remote_buf_start, uint32_t ptr_update_addr) { - uint32_t words = sizeof(fvcc_outbound_state) / 4; - uint32_t* ptr = (uint32_t*)this; - for (uint32_t i = 0; i < words; i++) { - ptr[i] = 0; - } - fvcc_buf = (volatile ctrl_chan_msg_buf*)buf_start; - fvcc_sync_buf = (volatile ctrl_chan_sync_buf*)sync_buf_start; - remote_fvcc_buf_start = remote_buf_start; - remote_ptr_update_addr = ptr_update_addr; - } - - inline uint32_t inc_ptr_with_wrap(uint32_t ptr) { return (ptr + 1) & FVCC_PTR_MASK; } - - inline void advance_out_rdptr() { out_rdptr = inc_ptr_with_wrap(out_rdptr); } - - inline void advance_fvcc_rdptr() { - uint32_t rd_ptr = remote_rdptr.ptr; - while (rd_ptr != fvcc_buf->rdptr.ptr) { - uint32_t msg_index = fvcc_buf->rdptr.ptr & FVCC_SIZE_MASK; - fvcc_buf->msg_buf[msg_index].packet_header.routing.flags = 0; - fvcc_buf->rdptr.ptr = inc_ptr_with_wrap(fvcc_buf->rdptr.ptr); - } - } - - template - inline uint32_t forward_data_from_fvcc_buffer() { - // If receiver ethernet fvcc is full, we cannot send more messages. - if (is_remote_fvcc_full()) { - return 0; - } - - if (fvcc_buf->wrptr.ptr != out_rdptr) { - // There are new messages to forward. - uint32_t msg_index = out_rdptr & FVCC_SIZE_MASK; - volatile packet_header_t* msg = &fvcc_buf->msg_buf[msg_index].packet_header; - bool msg_valid = msg->routing.flags != 0; - if (!msg_valid) { - return 0; - } - - uint32_t dest_addr = - remote_fvcc_buf_start + offsetof(ctrl_chan_msg_buf, msg_buf) + msg_index * sizeof(packet_header_t); - internal_::eth_send_packet( - 0, - (uint32_t)msg / PACKET_WORD_SIZE_BYTES, - dest_addr / PACKET_WORD_SIZE_BYTES, - PACKET_HEADER_SIZE_WORDS); - advance_out_rdptr(); - - uint32_t* sync_ptr = (uint32_t*)&fvcc_sync_buf->ptr[msg_index]; - sync_ptr[0] = out_rdptr; - sync_ptr[1] = 0; - sync_ptr[2] = 0; - sync_ptr[3] = out_rdptr; - internal_::eth_send_packet( - 0, ((uint32_t)sync_ptr) / PACKET_WORD_SIZE_BYTES, remote_ptr_update_addr / PACKET_WORD_SIZE_BYTES, 1); - } - - return PACKET_HEADER_SIZE_WORDS; - } - - inline void fvcc_handler() { - forward_data_from_fvcc_buffer(); - advance_fvcc_rdptr(); - } - -} fvcc_outbound_state_t; - -static_assert(sizeof(fvcc_outbound_state_t) % 4 == 0); - -// Fabric Virtual Control Channel (FVCC) Producer receives control/sync packets over ethernet from neighboring chip. -// Data in the producer is either destined for local chip, or has to make a noc hop -// to next outgoing ethernet port enroute to final destination. -// Control packets are forwarded to next fvcc consumer buffer in the route -// direction, if not meant for local device. -// If control packet is addressed to local device, FVCC producer can process the packet locally if -// it is a read/write ack, or forward the packet to Gatekeeper for further local processing. 
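Editor's note: the FVCC and socket-reader structures in this hunk all track read/write pointers modulo twice the buffer size, so a full buffer can be distinguished from an empty one without wasting a slot (see `is_remote_fvcc_full`, `get_num_msgs_available`, and `get_num_words_free` above and below). The following is a minimal, self-contained sketch of that convention only; `BUF_SIZE` and the helper names are assumed stand-ins for illustration and are not part of this patch.

    #include <cstdint>

    // Sketch of the double-range ring-buffer pointer scheme used above:
    // pointers advance modulo 2 * BUF_SIZE, occupancy is the wrapped
    // distance between them, and a distance of BUF_SIZE means "full".
    constexpr uint32_t BUF_SIZE = 16;                 // assumed stand-in for FVCC_BUF_SIZE (power of two)
    constexpr uint32_t PTR_MASK = 2 * BUF_SIZE - 1;   // analogous to FVCC_PTR_MASK

    inline uint32_t advance(uint32_t ptr, uint32_t inc) { return (ptr + inc) & PTR_MASK; }

    inline uint32_t occupancy(uint32_t wrptr, uint32_t rdptr) {
        return wrptr >= rdptr ? wrptr - rdptr : wrptr + 2 * BUF_SIZE - rdptr;
    }

    inline bool is_empty(uint32_t wrptr, uint32_t rdptr) { return wrptr == rdptr; }
    inline bool is_full(uint32_t wrptr, uint32_t rdptr) { return occupancy(wrptr, rdptr) >= BUF_SIZE; }

With this convention, `wrptr == rdptr` is unambiguously empty, matching the checks performed by the removed FVCC helpers.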
-typedef struct fvcc_inbound_state { - volatile chan_payload_ptr inbound_wrptr; - volatile chan_payload_ptr inbound_rdptr; - uint32_t remote_ptr_update_addr; - volatile ctrl_chan_msg_buf* fvcc_buf; // fvcc buffer that receives incoming control messages over ethernet. - uint8_t chan_num; - uint8_t packet_in_progress; - uint8_t pad1; - uint8_t pad2; - uint32_t fvc_out_wrptr; - uint32_t fvc_out_rdptr; - bool curr_packet_valid; - bool packet_corrupted; - uint64_t packet_timestamp; - uint64_t gk_fvcc_buf_addr; // fvcc buffer in gatekeeper. - volatile packet_header_t* current_packet_header; - - inline void init(uint32_t buf_start, uint32_t ptr_update_addr, uint64_t gk_fvcc_buf_start) { - uint32_t words = sizeof(fvcc_inbound_state) / 4; - uint32_t* ptr = (uint32_t*)this; - for (uint32_t i = 0; i < words; i++) { - ptr[i] = 0; - } - chan_num = 1; - fvcc_buf = (volatile ctrl_chan_msg_buf*)buf_start; - gk_fvcc_buf_addr = gk_fvcc_buf_start; - remote_ptr_update_addr = ptr_update_addr; - } - - inline uint32_t inc_ptr_with_wrap(uint32_t ptr, uint32_t inc) { return (ptr + inc) & FVCC_PTR_MASK; } - - inline void advance_local_wrptr(uint32_t inc) { inbound_wrptr.ptr = inc_ptr_with_wrap(inbound_wrptr.ptr, inc); } - - inline void advance_out_wrptr(uint32_t inc) { fvc_out_wrptr = inc_ptr_with_wrap(fvc_out_wrptr, inc); } - - inline void advance_out_rdptr(uint32_t inc) { fvc_out_rdptr = inc_ptr_with_wrap(fvc_out_rdptr, inc); } - - inline uint32_t get_num_msgs_available() const { - uint32_t wrptr = inbound_wrptr.ptr; - uint32_t msgs = 0; - if (fvc_out_rdptr != wrptr) { - msgs = wrptr > fvc_out_rdptr ? wrptr - fvc_out_rdptr : wrptr + 2 * FVCC_BUF_SIZE - fvc_out_rdptr; - } - return msgs; - } - - inline uint32_t get_num_msgs_free() { return FVCC_BUF_SIZE - get_num_msgs_available(); } - - inline uint32_t words_before_local_buffer_wrap() { - if (inbound_wrptr.ptr >= FVCC_BUF_SIZE) { - return (FVCC_BUF_SIZE * 2 - inbound_wrptr.ptr) * PACKET_HEADER_SIZE_WORDS; - } else { - return (FVCC_BUF_SIZE - inbound_wrptr.ptr) * PACKET_HEADER_SIZE_WORDS; - } - } - - inline bool get_curr_packet_valid() { - if (!curr_packet_valid && (get_num_msgs_available() >= 1)) { - uint32_t msg_index = fvc_out_rdptr & FVCC_SIZE_MASK; - bool msg_valid = fvcc_buf->msg_buf[msg_index].packet_header.routing.flags != 0; - if (msg_valid) { - current_packet_header = (volatile packet_header_t*)&fvcc_buf->msg_buf[msg_index]; - if (tt_fabric_is_header_valid((packet_header_t*)current_packet_header)) { - this->curr_packet_valid = true; - } else { - this->packet_corrupted = true; - } - } - } - return this->curr_packet_valid; - } - - inline uint32_t get_local_buffer_read_addr() { - uint32_t addr = (uint32_t)&fvcc_buf->msg_buf[fvc_out_rdptr & FVCC_SIZE_MASK]; - return addr; - } - - inline uint32_t get_local_buffer_write_addr() { - uint32_t addr = (uint32_t)&fvcc_buf->msg_buf[inbound_wrptr.ptr & FVCC_SIZE_MASK]; - ; - return addr; - } - - template - inline void update_remote_rdptr_sent() { - if (inbound_wrptr.ptr_cleared != inbound_rdptr.ptr) { - inbound_rdptr.ptr = inbound_wrptr.ptr_cleared; - if constexpr (fvc_mode == FVC_MODE_ROUTER) { - internal_::eth_send_packet( - 0, - ((uint32_t)&inbound_rdptr) / PACKET_WORD_SIZE_BYTES, - remote_ptr_update_addr / PACKET_WORD_SIZE_BYTES, - 1); - } - } - } - - template - inline void update_remote_rdptr_cleared() { - if (fvc_out_rdptr != inbound_rdptr.ptr_cleared) { - inbound_rdptr.ptr_cleared = fvc_out_rdptr; - if constexpr (fvc_mode == FVC_MODE_ROUTER) { - internal_::eth_send_packet( - 0, - ((uint32_t)&inbound_rdptr) / 
PACKET_WORD_SIZE_BYTES, - remote_ptr_update_addr / PACKET_WORD_SIZE_BYTES, - 1); - } - } - } - - uint32_t get_next_hop_router_noc_xy() { - uint32_t dst_mesh_id = current_packet_header->routing.dst_mesh_id; - if (dst_mesh_id != routing_table->my_mesh_id) { - uint32_t next_port = routing_table->inter_mesh_table.dest_entry[dst_mesh_id]; - return eth_chan_to_noc_xy[noc_index][next_port]; - } else { - uint32_t dst_device_id = current_packet_header->routing.dst_dev_id; - uint32_t next_port = routing_table->intra_mesh_table.dest_entry[dst_device_id]; - return eth_chan_to_noc_xy[noc_index][next_port]; - } - } - - inline bool packet_is_for_local_chip() { - return (current_packet_header->routing.dst_mesh_id == routing_table->my_mesh_id) && - (current_packet_header->routing.dst_dev_id == routing_table->my_device_id); - } - - // issue a pull request. - // currently blocks till the request queue has space. - // This needs to be non blocking, so that if one fvc pull request queue is full, - // we can process other fvcs and come back to check status of this pull request later. - inline void forward_message(uint64_t dest_addr) { - uint64_t noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, wrptr); - noc_fast_atomic_increment( - noc_index, - NCRISC_AT_CMD_BUF, - noc_addr, - NOC_UNICAST_WRITE_VC, - 1, - FVCC_BUF_LOG_SIZE, - false, - false, - (uint32_t)&fvcc_buf->wrptr.ptr); - while (!ncrisc_noc_nonposted_atomics_flushed(noc_index)); - uint32_t wrptr = fvcc_buf->wrptr.ptr; - noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, rdptr); - while (1) { - noc_async_read_one_packet(noc_addr, (uint32_t)(&fvcc_buf->rdptr.ptr), 4); - noc_async_read_barrier(); - if (!fvcc_buf_ptrs_full(wrptr, fvcc_buf->rdptr.ptr)) { - break; - } -#if defined(COMPILE_FOR_ERISC) - else { - // Consumer pull request buffer is full - // Context switch to enable base firmware routing - // as it might be handling slow dispatch traffic - internal_::risc_context_switch(); - } -#endif - } - uint32_t dest_wr_index = wrptr & FVCC_SIZE_MASK; - noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, msg_buf) + dest_wr_index * sizeof(packet_header_t); - noc_async_write_one_packet((uint32_t)(current_packet_header), noc_addr, sizeof(packet_header_t), noc_index); - } - - template - inline void process_inbound_packet() { - if (packet_is_for_local_chip()) { - if (current_packet_header->routing.flags == SYNC) { - if (current_packet_header->session.command == SOCKET_OPEN or - current_packet_header->session.command == SOCKET_CONNECT) { - // forward socket related messages to gatekeeper. - forward_message(gk_fvcc_buf_addr); - } else if (current_packet_header->session.command == ASYNC_WR_RESP) { - // Write response. Decrement transaction count for respective transaction id. - uint64_t noc_addr = ((uint64_t)current_packet_header->session.target_offset_h << 32) | - current_packet_header->session.target_offset_l; - noc_fast_atomic_increment( - noc_index, NCRISC_AT_CMD_BUF, noc_addr, NOC_UNICAST_WRITE_VC, -1, 31, false); - } - } - } else { - // Control message is not meant for local chip. Forward to next router enroute to destination. 
- uint64_t dest_addr = ((uint64_t)get_next_hop_router_noc_xy() << 32) | FVCC_OUT_BUF_START; - forward_message(dest_addr); - } - curr_packet_valid = false; - advance_out_wrptr(1); - advance_out_rdptr(1); - noc_async_write_barrier(); - update_remote_rdptr_cleared(); - } - - template - inline void fvcc_handler() { - if (get_curr_packet_valid()) { - process_inbound_packet(); - } - update_remote_rdptr_sent(); - } - -} fvcc_inbound_state_t; - -typedef struct socket_reader_state { - volatile chan_payload_ptr remote_rdptr; - uint8_t packet_in_progress; - - uint32_t packet_words_remaining; - uint32_t fvc_out_wrptr; - uint32_t fvc_out_rdptr; - uint32_t fvc_pull_wrptr; - uint32_t buffer_size; - uint32_t buffer_start; - uint32_t remote_buffer_start; - uint32_t pull_words_in_flight; - uint32_t words_since_last_sync; - - uint32_t get_num_words_free() { - uint32_t rd_ptr = remote_rdptr.ptr; - uint32_t words_occupied = 0; - if (fvc_pull_wrptr != rd_ptr) { - words_occupied = - fvc_pull_wrptr > rd_ptr ? fvc_pull_wrptr - rd_ptr : buffer_size * 2 + fvc_pull_wrptr - rd_ptr; - } - return buffer_size - words_occupied; - } - - uint32_t get_remote_num_words_free() { - uint32_t rd_ptr = remote_rdptr.ptr_cleared; - uint32_t words_occupied = 0; - if (fvc_out_wrptr != rd_ptr) { - words_occupied = fvc_out_wrptr > rd_ptr ? fvc_out_wrptr - rd_ptr : buffer_size * 2 + fvc_out_wrptr - rd_ptr; - } - return buffer_size - words_occupied; - } - - inline void init(uint32_t data_buf_start, uint32_t data_buf_size_words) { - uint32_t words = sizeof(socket_reader_state) / 4; - uint32_t* ptr = (uint32_t*)this; - for (uint32_t i = 0; i < words; i++) { - ptr[i] = 0; - } - buffer_start = data_buf_start; - buffer_size = data_buf_size_words; - remote_buffer_start = data_buf_start + buffer_size * PACKET_WORD_SIZE_BYTES; - } - - inline uint32_t words_before_local_buffer_wrap() { - if (fvc_pull_wrptr >= buffer_size) { - return buffer_size * 2 - fvc_pull_wrptr; - } else { - return buffer_size - fvc_pull_wrptr; - } - } - - inline uint32_t get_local_buffer_pull_addr() { - uint32_t addr = buffer_start; - uint32_t offset = fvc_pull_wrptr; - if (offset >= buffer_size) { - offset -= buffer_size; - } - addr = addr + (offset * PACKET_WORD_SIZE_BYTES); - return addr; - } - - inline uint32_t get_local_buffer_read_addr() { - uint32_t addr = buffer_start; - uint32_t offset = fvc_out_rdptr; - if (offset >= buffer_size) { - offset -= buffer_size; - } - addr = addr + (offset * PACKET_WORD_SIZE_BYTES); - return addr; - } - - inline uint32_t get_remote_buffer_write_addr() { - uint32_t addr = remote_buffer_start; - uint32_t offset = fvc_out_wrptr; - if (offset >= buffer_size) { - offset -= buffer_size; - } - addr = addr + (offset * PACKET_WORD_SIZE_BYTES); - return addr; - } - - inline void advance_pull_wrptr(uint32_t num_words) { - uint32_t temp = fvc_pull_wrptr + num_words; - if (temp >= buffer_size * 2) { - temp -= buffer_size * 2; - } - fvc_pull_wrptr = temp; - } - - inline void advance_out_wrptr(uint32_t num_words) { - uint32_t temp = fvc_out_wrptr + num_words; - if (temp >= buffer_size * 2) { - temp -= buffer_size * 2; - } - fvc_out_wrptr = temp; - } - - inline void advance_out_rdptr(uint32_t num_words) { - uint32_t temp = fvc_out_rdptr + num_words; - if (temp >= buffer_size * 2) { - temp -= buffer_size * 2; - } - fvc_out_rdptr = temp; - } - - inline void register_pull_data(uint32_t num_words_to_pull) { - pull_words_in_flight += num_words_to_pull; - advance_pull_wrptr(num_words_to_pull); - words_since_last_sync += num_words_to_pull; - 
packet_words_remaining -= num_words_to_pull; - } - - inline uint32_t get_num_words_to_pull(volatile pull_request_t* pull_request) { - uint32_t num_words_to_pull = num_words_available_to_pull(pull_request); - uint32_t num_words_before_wrap = words_before_buffer_wrap(pull_request->buffer_size, pull_request->rd_ptr); - - num_words_to_pull = std::min(num_words_to_pull, num_words_before_wrap); - uint32_t socket_buffer_space = get_num_words_free(); - num_words_to_pull = std::min(num_words_to_pull, socket_buffer_space); - - if (num_words_to_pull == 0) { - return 0; - } - - uint32_t space_before_wptr_wrap = words_before_local_buffer_wrap(); - num_words_to_pull = std::min(num_words_to_pull, space_before_wptr_wrap); - - return num_words_to_pull; - } - - inline uint32_t pull_socket_data(volatile pull_request_t* pull_request) { - volatile uint32_t* temp = (volatile uint32_t*)0xffb2010c; - if (packet_in_progress == 0) { - uint32_t size = pull_request->size; - packet_words_remaining = (size + PACKET_WORD_SIZE_BYTES - 1) >> 4; - packet_in_progress = 1; - } - - uint32_t num_words_to_pull = get_num_words_to_pull(pull_request); - if (num_words_to_pull == 0) { - temp[0] = 0xdead1111; - return 0; - } - - uint32_t rd_offset = get_rd_ptr_offset_words((pull_request_t*)pull_request); - uint64_t src_addr = pull_request->buffer_start + (rd_offset * PACKET_WORD_SIZE_BYTES); - uint32_t local_addr = get_local_buffer_pull_addr(); - - // pull_data_from_remote(); - noc_async_read(src_addr, local_addr, num_words_to_pull * PACKET_WORD_SIZE_BYTES); - register_pull_data(num_words_to_pull); - pull_request->rd_ptr = advance_ptr(pull_request->buffer_size, pull_request->rd_ptr, num_words_to_pull); - - return num_words_to_pull; - } - - template - inline uint32_t push_socket_data() { - uint32_t total_words_to_forward = 0; - uint32_t wrptr = fvc_pull_wrptr; - - total_words_to_forward = - wrptr > fvc_out_rdptr ? wrptr - fvc_out_rdptr : buffer_size * 2 + wrptr - fvc_out_rdptr; - - uint32_t remote_fvc_buffer_space = get_remote_num_words_free(); - total_words_to_forward = std::min(total_words_to_forward, remote_fvc_buffer_space); - if (total_words_to_forward == 0) { - return 0; - } - - if (packet_words_remaining and (words_since_last_sync < FVC_SYNC_THRESHOLD)) { - // not enough data to forward. - // wait for more data. - return 0; - } - - if constexpr (live == true) { - uint32_t src_addr = 0; - uint32_t dest_addr = 0; // should be second half of fvc buffer. - uint32_t words_remaining = total_words_to_forward; - while (words_remaining) { - uint32_t num_words_before_local_wrap = words_before_buffer_wrap(buffer_size, fvc_out_rdptr); - uint32_t num_words_before_remote_wrap = words_before_buffer_wrap(buffer_size, fvc_out_wrptr); - uint32_t words_to_forward = std::min(num_words_before_local_wrap, num_words_before_remote_wrap); - words_to_forward = std::min(words_to_forward, words_remaining); - // max 8K bytes - words_to_forward = std::min(words_to_forward, DEFAULT_MAX_NOC_SEND_WORDS); - src_addr = get_local_buffer_read_addr(); - dest_addr = get_remote_buffer_write_addr(); - - // TODO: Issue a noc write here to forward data. 
- - advance_out_rdptr(words_to_forward); - advance_out_wrptr(words_to_forward); - words_remaining -= words_to_forward; - } - } else { - advance_out_rdptr(total_words_to_forward); - advance_out_wrptr(total_words_to_forward); - remote_rdptr.ptr = fvc_out_rdptr; - remote_rdptr.ptr_cleared = fvc_out_wrptr; - } - words_since_last_sync -= total_words_to_forward; - return total_words_to_forward; - } -} socket_reader_state_t; - -static_assert(sizeof(socket_reader_state_t) % 4 == 0); typedef struct router_state { uint32_t sync_in; @@ -1142,6 +615,28 @@ typedef struct router_state { inline uint64_t get_timestamp_32b() { return reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); } +void tt_fabric_add_header_checksum(packet_header_t* p_header) { + uint16_t* ptr = (uint16_t*)p_header; + uint32_t sum = 0; + for (uint32_t i = 2; i < sizeof(packet_header_t) / 2; i++) { + sum += ptr[i]; + } + sum = ~sum; + sum += sum; + p_header->packet_parameters.misc_parameters.words[0] = sum; +} + +bool tt_fabric_is_header_valid(packet_header_t* p_header) { + uint16_t* ptr = (uint16_t*)p_header; + uint32_t sum = 0; + for (uint32_t i = 2; i < sizeof(packet_header_t) / 2; i++) { + sum += ptr[i]; + } + sum = ~sum; + sum += sum; + return (p_header->packet_parameters.misc_parameters.words[0] == sum); +} + void zero_l1_buf(tt_l1_ptr uint32_t* buf, uint32_t size_bytes) { for (uint32_t i = 0; i < size_bytes / 4; i++) { buf[i] = 0; diff --git a/tt_fabric/hw/inc/tt_fabric_api.h b/tt_fabric/hw/inc/tt_fabric_api.h index 98a4037fb29..6978f58ff9d 100644 --- a/tt_fabric/hw/inc/tt_fabric_api.h +++ b/tt_fabric/hw/inc/tt_fabric_api.h @@ -9,12 +9,17 @@ #include "dataflow_api.h" #include "noc_overlay_parameters.h" #include "ethernet/dataflow_api.h" -#include "tt_fabric_interface.h" +#include "tt_fabric.hpp" -extern volatile local_pull_request_t* local_pull_request; -extern volatile tt_fabric_client_interface_t* client_interface; +typedef struct tt_fabric_endpoint_sync { + uint32_t sync_addr : 24; + uint32_t endpoint_type : 8 +} tt_fabric_endpoint_sync_t; + +static_assert(sizeof(tt_fabric_endpoint_sync_t) == 4); + +extern local_pull_request_t local_pull_request; -/* inline void fabric_async_write( uint32_t routing_plane, // the network plane to use for this transaction uint32_t src_addr, // source address in sender’s memory @@ -35,125 +40,56 @@ inline void fabric_async_write( uint64_t router_request_queue = NOC_XY_ADDR(2, 0, 0x19000); tt_fabric_send_pull_request(router_request_queue, &local_pull_request); } -*/ -inline uint32_t get_next_hop_router_noc_xy(uint32_t routing_plane, uint32_t dst_mesh_id, uint32_t dst_dev_id) { - if (dst_mesh_id != routing_table[routing_plane].my_mesh_id) { - uint32_t next_port = routing_table[routing_plane].inter_mesh_table.dest_entry[dst_mesh_id]; - return eth_chan_to_noc_xy[noc_index][next_port]; - } else { - uint32_t next_port = routing_table[routing_plane].intra_mesh_table.dest_entry[dst_dev_id]; - return eth_chan_to_noc_xy[noc_index][next_port]; - } -} +/** + * Polling for ready signal from the remote peers of all input and output queues. + * Blocks until all are ready, but doesn't block polling on each individual queue. + * Returns false in case of timeout. 
+ */ +bool wait_all_src_dest_ready(volatile router_state_t* router_state, uint32_t timeout_cycles = 0) { + bool src_ready = false; + bool dest_ready = false; -inline void send_message_to_gk() { - uint64_t gk_noc_base = client_interface->gk_msg_buf_addr; - uint64_t noc_addr = gk_noc_base + offsetof(ctrl_chan_msg_buf, wrptr); - noc_fast_atomic_increment( - noc_index, - NCRISC_AT_CMD_BUF, - noc_addr, - NOC_UNICAST_WRITE_VC, - 1, - FVCC_BUF_LOG_SIZE, - false, - false, - (uint32_t)&client_interface->wrptr.ptr); - while (!ncrisc_noc_nonposted_atomics_flushed(noc_index)); - uint32_t wrptr = client_interface->wrptr.ptr; - noc_addr = gk_noc_base + offsetof(ctrl_chan_msg_buf, rdptr); - while (1) { - noc_async_read_one_packet(noc_addr, (uint32_t)(&client_interface->rdptr.ptr), 4); - noc_async_read_barrier(); - if (!fvcc_buf_ptrs_full(wrptr, client_interface->rdptr.ptr)) { - break; - } - } - uint32_t dest_wr_index = wrptr & FVCC_SIZE_MASK; - noc_addr = gk_noc_base + offsetof(ctrl_chan_msg_buf, msg_buf) + dest_wr_index * sizeof(packet_header_t); - noc_async_write_one_packet((uint32_t)(&client_interface->gk_message), noc_addr, sizeof(packet_header_t), noc_index); - noc_async_write_barrier(); -} + uint32_t iters = 0; -inline socket_handle_t* fabric_socket_open( - uint32_t routing_plane, // the network plane to use for this socket - uint16_t epoch_id, // Temporal epoch for which the socket is being opened - uint16_t socket_id, // Socket Id to open - uint8_t socket_type, // Unicast, Multicast, SSocket, DSocket - uint8_t direction, // Send or Receive - uint16_t remote_mesh_id, // Remote mesh/device that is the socket data sender/receiver. - uint16_t remote_dev_id, - uint8_t fvc // fabric virtual channel. -) { - uint32_t socket_count = client_interface->socket_count; - socket_handle_t* socket_handle = (socket_handle_t*)&client_interface->socket_handles[socket_count]; - socket_count++; - client_interface->socket_count = socket_count; - socket_handle->socket_state = SocketState::OPENING; + uint32_t start_timestamp = get_timestamp_32b(); + uint32_t sync_in_addr = ((uint32_t)&router_state->sync_in) / PACKET_WORD_SIZE_BYTES; + uint32_t sync_out_addr = ((uint32_t)&router_state->sync_out) / PACKET_WORD_SIZE_BYTES; - if (direction == SOCKET_DIRECTION_SEND) { - client_interface->gk_message.packet_header.routing.dst_mesh_id = remote_mesh_id; - client_interface->gk_message.packet_header.routing.dst_dev_id = remote_dev_id; - } else { - client_interface->gk_message.packet_header.routing.src_mesh_id = remote_mesh_id; - client_interface->gk_message.packet_header.routing.src_dev_id = remote_dev_id; - } - client_interface->gk_message.packet_header.routing.flags = SYNC; - client_interface->gk_message.packet_header.session.command = SOCKET_OPEN; - client_interface->gk_message.packet_header.session.target_offset_h = client_interface->pull_req_buf_addr >> 32; - client_interface->gk_message.packet_header.session.target_offset_l = (uint32_t)client_interface->pull_req_buf_addr; - client_interface->gk_message.packet_header.session.ack_offset_h = xy_local_addr >> 32; - client_interface->gk_message.packet_header.session.ack_offset_l = (uint32_t)socket_handle; - client_interface->gk_message.packet_header.packet_parameters.socket_parameters.socket_id = socket_id; - client_interface->gk_message.packet_header.packet_parameters.socket_parameters.epoch_id = epoch_id; - client_interface->gk_message.packet_header.packet_parameters.socket_parameters.socket_type = socket_type; - 
client_interface->gk_message.packet_header.packet_parameters.socket_parameters.socket_direction = direction; - client_interface->gk_message.packet_header.packet_parameters.socket_parameters.routing_plane = routing_plane; - tt_fabric_add_header_checksum((packet_header_t*)&client_interface->gk_message.packet_header); - send_message_to_gk(); - return socket_handle; -} + uint32_t scratch_addr = ((uint32_t)&router_state->scratch) / PACKET_WORD_SIZE_BYTES; + router_state->scratch[0] = 0xAA; + // send_buf[1] = 0x0; + // send_buf[2] = 0x0; + // send_buf[3] = 0x0; -inline void fabric_socket_close(socket_handle_t* socket_handle) { - packet_header_t* packet_header = (packet_header_t*)&client_interface->gk_message.packet_header; - uint32_t dst_mesh_id = socket_handle->rcvr_mesh_id; - uint32_t dst_dev_id = socket_handle->rcvr_dev_id; - packet_header->routing.flags = INLINE_FORWARD; - packet_header->routing.dst_mesh_id = dst_mesh_id; - packet_header->routing.dst_dev_id = dst_dev_id; - packet_header->routing.packet_size_bytes = PACKET_HEADER_SIZE_BYTES; - packet_header->session.command = SOCKET_CLOSE; - packet_header->session.target_offset_l = (uint32_t)socket_handle->pull_notification_adddr; - packet_header->session.target_offset_h = socket_handle->pull_notification_adddr >> 32; - tt_fabric_add_header_checksum(packet_header); + while (!src_ready or !dest_ready) { + if (router_state->sync_out != 0xAA) { + internal_::eth_send_packet(0, scratch_addr, sync_in_addr, 1); + } else { + dest_ready = true; + } - uint32_t* dst = (uint32_t*)&client_interface->local_pull_request.pull_request; - uint32_t* src = (uint32_t*)packet_header; - for (uint32_t i = 0; i < sizeof(pull_request_t) / 4; i++) { - dst[i] = src[i]; - } - uint64_t dest_addr = - ((uint64_t)get_next_hop_router_noc_xy(socket_handle->routing_plane, dst_mesh_id, dst_dev_id) << 32) | - FABRIC_ROUTER_REQ_QUEUE_START; - tt_fabric_send_pull_request(dest_addr, (volatile local_pull_request_t*)&client_interface->local_pull_request); -} + if (!src_ready && router_state->sync_in == 0xAA) { + internal_::eth_send_packet(0, sync_in_addr, sync_out_addr, 1); + src_ready = true; + } -inline void fabric_socket_connect(socket_handle_t* socket_handle) { - // wait for socket state to change to Active. - // Gatekeeper will update local socket handle when the receiver for send socket - // is ready. - while (((volatile socket_handle_t*)socket_handle)->socket_state != SocketState::ACTIVE); -} + iters++; + if (timeout_cycles > 0) { + uint32_t cycles_since_start = get_timestamp_32b() - start_timestamp; + if (cycles_since_start > timeout_cycles) { + return false; + } + } -inline void fabric_endpoint_init() { - uint64_t noc_addr = client_interface->gk_interface_addr + offsetof(gatekeeper_info_t, ep_sync); - client_interface->return_status[0] = 0; - while (1) { - noc_async_read_one_packet(noc_addr, (uint32_t)&client_interface->return_status[0], 4); - noc_async_read_barrier(); - if (client_interface->return_status[0] != 0) { - break; +#if defined(COMPILE_FOR_ERISC) + if ((timeout_cycles == 0) && (iters & 0xFFF) == 0) { + // if timeout is disabled, context switch every 4096 iterations. + // this is necessary to allow ethernet routing layer to operate. + // this core may have pending ethernet routing work. 
+ internal_::risc_context_switch(); } +#endif } + return true; } diff --git a/tt_fabric/hw/inc/tt_fabric_interface.h b/tt_fabric/hw/inc/tt_fabric_interface.h index f84c859dad3..affefa22e20 100644 --- a/tt_fabric/hw/inc/tt_fabric_interface.h +++ b/tt_fabric/hw/inc/tt_fabric_interface.h @@ -16,7 +16,8 @@ static_assert(sizeof(tt_fabric_endpoint_sync_t) == 4); constexpr uint32_t PACKET_WORD_SIZE_BYTES = 16; constexpr uint32_t NUM_WR_CMD_BUFS = 4; -constexpr uint32_t DEFAULT_MAX_NOC_SEND_WORDS = (NOC_MAX_BURST_WORDS * NOC_WORD_BYTES) / PACKET_WORD_SIZE_BYTES; +constexpr uint32_t DEFAULT_MAX_NOC_SEND_WORDS = + (NUM_WR_CMD_BUFS - 1) * (NOC_MAX_BURST_WORDS * NOC_WORD_BYTES) / PACKET_WORD_SIZE_BYTES; constexpr uint32_t DEFAULT_MAX_ETH_SEND_WORDS = 2 * 1024; constexpr uint32_t FVC_SYNC_THRESHOLD = 256; @@ -29,9 +30,6 @@ enum SessionCommand : uint32_t { SSOCKET_WR = (0x1 << 5), ATOMIC_INC = (0x1 << 6), ATOMIC_READ_INC = (0x1 << 7), - SOCKET_OPEN = (0x1 << 8), - SOCKET_CLOSE = (0x1 << 9), - SOCKET_CONNECT = (0x1 << 10), }; #define INVALID 0x0 @@ -78,13 +76,7 @@ typedef struct mcast_params { } mcast_params; typedef struct socket_params { - uint32_t padding1; - uint16_t socket_id; - uint16_t epoch_id; - uint8_t socket_type; - uint8_t socket_direction; - uint8_t routing_plane; - uint8_t padding; + uint32_t socket_id; } socket_params; typedef struct atomic_params { @@ -125,28 +117,6 @@ const uint32_t PACKET_HEADER_SIZE_WORDS = PACKET_HEADER_SIZE_BYTES / PACKET_WORD static_assert(sizeof(packet_header) == PACKET_HEADER_SIZE_BYTES); -void tt_fabric_add_header_checksum(packet_header_t* p_header) { - uint16_t* ptr = (uint16_t*)p_header; - uint32_t sum = 0; - for (uint32_t i = 2; i < sizeof(packet_header_t) / 2; i++) { - sum += ptr[i]; - } - sum = ~sum; - sum += sum; - p_header->packet_parameters.misc_parameters.words[0] = sum; -} - -bool tt_fabric_is_header_valid(packet_header_t* p_header) { - uint16_t* ptr = (uint16_t*)p_header; - uint32_t sum = 0; - for (uint32_t i = 2; i < sizeof(packet_header_t) / 2; i++) { - sum += ptr[i]; - } - sum = ~sum; - sum += sum; - return (p_header->packet_parameters.misc_parameters.words[0] == sum); -} - // This is a pull request entry for a fabric router. // Pull request issuer populates these entries to identify // the data that fabric router needs to pull from requestor. @@ -218,127 +188,11 @@ typedef struct chan_payload_ptr { static_assert(sizeof(chan_payload_ptr) == CHAN_PTR_SIZE_BYTES); -// Fabric Virtual Control Channel (FVCC) parameters. -// Each control channel message is 48 Bytes. -// FVCC buffer is a 16 message buffer each for incoming and outgoing messages. -// Control message capacity can be increased by increasing FVCC_BUF_SIZE. -const uint32_t FVCC_BUF_SIZE = 16; // must be 2^N -const uint32_t FVCC_BUF_LOG_SIZE = 4; // must be log2(FVCC_BUF_SIZE) -const uint32_t FVCC_SIZE_MASK = (FVCC_BUF_SIZE - 1); -const uint32_t FVCC_PTR_MASK = ((FVCC_BUF_SIZE << 1) - 1); -const uint32_t FVCC_BUF_SIZE_BYTES = PULL_REQ_SIZE_BYTES * FVCC_BUF_SIZE + 2 * CHAN_PTR_SIZE_BYTES; -const uint32_t FVCC_SYNC_BUF_SIZE_BYTES = CHAN_PTR_SIZE_BYTES * FVCC_BUF_SIZE; - -inline bool fvcc_buf_ptrs_empty(uint32_t wrptr, uint32_t rdptr) { return (wrptr == rdptr); } - -inline bool fvcc_buf_ptrs_full(uint32_t wrptr, uint32_t rdptr) { - uint32_t distance = wrptr >= rdptr ? 
wrptr - rdptr : wrptr + 2 * FVCC_BUF_SIZE - rdptr; - return !fvcc_buf_ptrs_empty(wrptr, rdptr) && (distance >= FVCC_BUF_SIZE); -} - -// out_req_buf has 16 byte additional storage per entry to hold the outgoing -// write pointer update. this is sent over ethernet. -// For incoming requests over ethernet, we only need storate for the request -// entry. The pointer update goes to fvcc state. -typedef struct ctrl_chan_msg_buf { - chan_ptr wrptr; - chan_ptr rdptr; - chan_request_entry_t msg_buf[FVCC_BUF_SIZE]; -} ctrl_chan_msg_buf; - -typedef struct ctrl_chan_sync_buf { - chan_payload_ptr ptr[FVCC_BUF_SIZE]; -} ctrl_chan_sync_buf; - -static_assert(sizeof(ctrl_chan_msg_buf) == FVCC_BUF_SIZE_BYTES); - -typedef struct sync_word { - uint32_t val; - uint32_t padding[3]; -} sync_word_t; - -typedef struct gatekeeper_info { - sync_word_t router_sync; - sync_word_t ep_sync; - uint32_t routing_planes; - uint32_t padding[3]; - ctrl_chan_msg_buf gk_msg_buf; -} gatekeeper_info_t; - -#define SOCKET_DIRECTION_SEND 1 -#define SOCKET_DIRECTION_RECV 2 -#define SOCKET_TYPE_DGRAM 1 -#define SOCKET_TYPE_STREAM 2 - -enum SocketState : uint8_t { - IDLE = 0, - OPENING = 1, - ACTIVE = 2, - CLOSING = 3, -}; - -typedef struct socket_handle { - uint16_t socket_id; - uint16_t epoch_id; - uint8_t socket_state; - uint8_t socket_type; - uint8_t socket_direction; - uint8_t rcvrs_ready; - uint32_t routing_plane; - uint16_t sender_mesh_id; - uint16_t sender_dev_id; - uint16_t rcvr_mesh_id; - uint16_t rcvr_dev_id; - uint32_t sender_handle; - uint64_t pull_notification_adddr; - uint64_t status_notification_addr; - uint32_t padding[2]; -} socket_handle_t; - -static_assert(sizeof(socket_handle_t) % 16 == 0); - -constexpr uint32_t MAX_SOCKETS = 64; -typedef struct socket_info { - uint32_t socket_count; - uint32_t socket_setup_pending; - uint32_t padding[2]; - socket_handle_t sockets[MAX_SOCKETS]; - chan_ptr wrptr; - chan_ptr rdptr; - chan_request_entry_t gk_message; -} socket_info_t; -static_assert(sizeof(socket_info_t) % 16 == 0); - -typedef struct tt_fabric_client_interface { - uint64_t gk_interface_addr; - uint64_t gk_msg_buf_addr; - uint64_t pull_req_buf_addr; - uint32_t padding[2]; - uint32_t return_status[3]; - uint32_t socket_count; - chan_ptr wrptr; - chan_ptr rdptr; - chan_request_entry_t gk_message; - local_pull_request_t local_pull_request; - socket_handle_t socket_handles[MAX_SOCKETS]; -} tt_fabric_client_interface_t; - -static_assert(sizeof(tt_fabric_client_interface_t) % 16 == 0); - constexpr uint32_t FABRIC_ROUTER_MISC_START = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; constexpr uint32_t FABRIC_ROUTER_MISC_SIZE = 256; constexpr uint32_t FABRIC_ROUTER_SYNC_SEM = FABRIC_ROUTER_MISC_START; constexpr uint32_t FABRIC_ROUTER_SYNC_SEM_SIZE = 16; -// Fabric Virtual Control Channel start/size -constexpr uint32_t FVCC_OUT_BUF_START = FABRIC_ROUTER_MISC_START + FABRIC_ROUTER_MISC_SIZE; -constexpr uint32_t FVCC_OUT_BUF_SIZE = FVCC_BUF_SIZE_BYTES; -constexpr uint32_t FVCC_SYNC_BUF_START = FVCC_OUT_BUF_START + FVCC_OUT_BUF_SIZE; -constexpr uint32_t FVCC_SYNC_BUF_SIZE = FVCC_SYNC_BUF_SIZE_BYTES; -constexpr uint32_t FVCC_IN_BUF_START = FVCC_SYNC_BUF_START + FVCC_SYNC_BUF_SIZE; -constexpr uint32_t FVCC_IN_BUF_SIZE = FVCC_BUF_SIZE_BYTES; - -// Fabric Virtual Channel start/size -constexpr uint32_t FABRIC_ROUTER_REQ_QUEUE_START = FVCC_IN_BUF_START + FVCC_IN_BUF_SIZE; +constexpr uint32_t FABRIC_ROUTER_REQ_QUEUE_START = FABRIC_ROUTER_MISC_START + FABRIC_ROUTER_MISC_SIZE; constexpr uint32_t FABRIC_ROUTER_REQ_QUEUE_SIZE = 
sizeof(chan_req_buf); constexpr uint32_t FABRIC_ROUTER_DATA_BUF_START = FABRIC_ROUTER_REQ_QUEUE_START + FABRIC_ROUTER_REQ_QUEUE_SIZE; diff --git a/tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp b/tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp deleted file mode 100644 index cf88eb6c7e5..00000000000 --- a/tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp +++ /dev/null @@ -1,506 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -// clang-format off -#include "dataflow_api.h" -#include "tt_fabric/hw/inc/tt_fabric.h" -#include "debug/dprint.h" -// clang-format on - -constexpr uint32_t gatekeeper_info_addr = get_compile_time_arg_val(0); -constexpr uint32_t socket_info_addr = get_compile_time_arg_val(1); -constexpr uint32_t routing_table_addr = get_compile_time_arg_val(2); -constexpr uint32_t kernel_status_buf_addr = get_compile_time_arg_val(3); -constexpr uint32_t kernel_status_buf_size_bytes = get_compile_time_arg_val(4); -constexpr uint32_t timeout_cycles = get_compile_time_arg_val(5); -uint32_t sync_val; -uint32_t router_mask; - -constexpr uint32_t PACKET_QUEUE_STAUS_MASK = 0xabc00000; -constexpr uint32_t PACKET_QUEUE_TEST_STARTED = PACKET_QUEUE_STAUS_MASK | 0x0; -constexpr uint32_t PACKET_QUEUE_TEST_PASS = PACKET_QUEUE_STAUS_MASK | 0x1; -constexpr uint32_t PACKET_QUEUE_TEST_TIMEOUT = PACKET_QUEUE_STAUS_MASK | 0xdead0; -constexpr uint32_t PACKET_QUEUE_TEST_BAD_HEADER = PACKET_QUEUE_STAUS_MASK | 0xdead1; -constexpr uint32_t PACKET_QUEUE_TEST_DATA_MISMATCH = PACKET_QUEUE_STAUS_MASK | 0x3; - -// indexes of return values in test results buffer -constexpr uint32_t PQ_TEST_STATUS_INDEX = 0; -constexpr uint32_t PQ_TEST_WORD_CNT_INDEX = 2; -constexpr uint32_t PQ_TEST_CYCLES_INDEX = 4; -constexpr uint32_t PQ_TEST_ITER_INDEX = 6; -constexpr uint32_t PQ_TEST_MISC_INDEX = 16; - -// careful, may be null -tt_l1_ptr uint32_t* const kernel_status = reinterpret_cast(kernel_status_buf_addr); -volatile tt_l1_ptr tt::tt_fabric::fabric_router_l1_config_t* routing_table = - reinterpret_cast(routing_table_addr); - -volatile gatekeeper_info_t* gk_info = (volatile gatekeeper_info_t*)gatekeeper_info_addr; -volatile socket_info_t* socket_info = (volatile socket_info_t*)socket_info_addr; - -uint64_t xy_local_addr; -uint64_t router_addr; -uint32_t gk_message_pending; - -inline void notify_all_routers(uint32_t notification) { - uint32_t remaining_cores = router_mask; - for (uint32_t i = 0; i < 16; i++) { - if (remaining_cores == 0) { - break; - } - if (remaining_cores & (0x1 << i)) { - uint64_t dest_addr = ((uint64_t)eth_chan_to_noc_xy[noc_index][i] << 32) | FABRIC_ROUTER_SYNC_SEM; - noc_inline_dw_write(dest_addr, notification); - remaining_cores &= ~(0x1 << i); - } - } -} - -inline void sync_all_routers() { - // wait for all device routers to have incremented the sync semaphore. - // sync_val is equal to number of tt-fabric routers running on a device. - while (gk_info->router_sync.val != sync_val); - - // send semaphore increment to all fabric routers on this device. - // semaphore notifies all other routers that this router has completed - // startup handshake with its ethernet peer. 
- notify_all_routers(sync_val); - gk_info->ep_sync.val = sync_val; -} - -inline void gk_msg_buf_ptr_advance(chan_ptr* ptr) { ptr->ptr = (ptr->ptr + 1) & FVCC_PTR_MASK; } - -inline void gk_msg_buf_advance_wrptr(ctrl_chan_msg_buf* msg_buf) { gk_msg_buf_ptr_advance(&(msg_buf->wrptr)); } - -inline void gk_msg_buf_advance_rdptr(ctrl_chan_msg_buf* msg_buf) { - // clear valid before incrementing read pointer. - uint32_t rd_index = msg_buf->rdptr.ptr & FVCC_SIZE_MASK; - msg_buf->msg_buf[rd_index].bytes[47] = 0; - gk_msg_buf_ptr_advance(&(msg_buf->rdptr)); -} - -inline bool gk_msg_buf_ptrs_empty(uint32_t wrptr, uint32_t rdptr) { return (wrptr == rdptr); } - -inline bool gk_msg_buf_ptrs_full(uint32_t wrptr, uint32_t rdptr) { - uint32_t distance = wrptr >= rdptr ? wrptr - rdptr : wrptr + 2 * FVCC_BUF_SIZE - rdptr; - return !gk_msg_buf_ptrs_empty(wrptr, rdptr) && (distance >= FVCC_BUF_SIZE); -} - -inline bool gk_msg_buf_is_empty(const volatile ctrl_chan_msg_buf* msg_buf) { - return gk_msg_buf_ptrs_empty(msg_buf->wrptr.ptr, msg_buf->rdptr.ptr); -} - -inline bool gk_msg_buf_is_full(const volatile ctrl_chan_msg_buf* msg_buf) { - return gk_msg_buf_ptrs_full(msg_buf->wrptr.ptr, msg_buf->rdptr.ptr); -} - -inline bool gk_msg_valid(const volatile ctrl_chan_msg_buf* msg_buf) { - uint32_t rd_index = msg_buf->rdptr.ptr & FVCC_SIZE_MASK; - return msg_buf->msg_buf[rd_index].packet_header.routing.flags != 0; -} - -inline socket_handle_t* socket_receiver_available(packet_header_t* packet) { - socket_handle_t* handle = 0; - bool sender_found = false; - for (uint32_t i = 0; i < MAX_SOCKETS; i++) { - if (socket_info->sockets[i].socket_state == SocketState::OPENING && - socket_info->sockets[i].socket_direction == SOCKET_DIRECTION_SEND) { - handle = (socket_handle_t*)&socket_info->sockets[i]; - if (handle->socket_id != packet->packet_parameters.socket_parameters.socket_id) { - continue; - } - if (handle->epoch_id != packet->packet_parameters.socket_parameters.epoch_id) { - continue; - } - if (handle->sender_dev_id != routing_table[0].my_device_id) { - continue; - } - if (handle->sender_mesh_id != routing_table[0].my_mesh_id) { - continue; - } - if (handle->rcvr_dev_id != packet->routing.dst_dev_id) { - continue; - } - if (handle->rcvr_mesh_id != packet->routing.dst_mesh_id) { - continue; - } - sender_found = true; - break; - } - } - return sender_found ? handle : 0; -} -inline void set_socket_active(socket_handle_t* handle) { - handle->socket_state = SocketState::ACTIVE; - // send socket connection state to send socket opener. - noc_async_write_one_packet( - (uint32_t)(handle), handle->status_notification_addr, sizeof(socket_handle_t), noc_index); -} - -inline void socket_open(packet_header_t* packet) { - socket_handle_t* handle = 0; - if (packet->packet_parameters.socket_parameters.socket_direction == SOCKET_DIRECTION_SEND) { - handle = socket_receiver_available(packet); - if (handle) { - // If remote receive socket already opened, - // set send socket state to active and return. - handle->status_notification_addr = - ((uint64_t)packet->session.ack_offset_h << 32) | packet->session.ack_offset_l; - set_socket_active(handle); - DPRINT << "GK: Receiver Available " << (uint32_t)handle->socket_id << ENDL(); - return; - } - } - - for (uint32_t i = 0; i < MAX_SOCKETS; i++) { - if (socket_info->sockets[i].socket_state == 0) { - // idle socket handle. 
- socket_handle_t* handle = (socket_handle_t*)&socket_info->sockets[i]; - handle->socket_id = packet->packet_parameters.socket_parameters.socket_id; - handle->epoch_id = packet->packet_parameters.socket_parameters.epoch_id; - handle->socket_type = packet->packet_parameters.socket_parameters.socket_type; - handle->socket_direction = packet->packet_parameters.socket_parameters.socket_direction; - handle->routing_plane = packet->packet_parameters.socket_parameters.routing_plane; - handle->status_notification_addr = - ((uint64_t)packet->session.ack_offset_h << 32) | packet->session.ack_offset_l; - handle->socket_state = SocketState::OPENING; - - if (handle->socket_direction == SOCKET_DIRECTION_RECV) { - handle->pull_notification_adddr = - ((uint64_t)packet->session.target_offset_h << 32) | packet->session.target_offset_l; - handle->sender_dev_id = packet->routing.src_dev_id; - handle->sender_mesh_id = packet->routing.src_mesh_id; - handle->rcvr_dev_id = routing_table[0].my_device_id; - handle->rcvr_mesh_id = routing_table[0].my_mesh_id; - socket_info->socket_setup_pending++; - } else { - handle->sender_dev_id = routing_table[0].my_device_id; - handle->sender_mesh_id = routing_table[0].my_mesh_id; - handle->rcvr_dev_id = packet->routing.dst_dev_id; - handle->rcvr_mesh_id = packet->routing.dst_mesh_id; - } - DPRINT << "GK: Socket Opened : " << (uint32_t)handle->socket_id << ENDL(); - break; - } - } -} - -inline void socket_close(packet_header_t* packet) { - for (uint32_t i = 0; i < MAX_SOCKETS; i++) { - bool found = socket_info->sockets[i].socket_id == packet->packet_parameters.socket_parameters.socket_id && - socket_info->sockets[i].epoch_id == packet->packet_parameters.socket_parameters.epoch_id; - if (found) { - socket_handle_t* handle = (socket_handle_t*)&socket_info->sockets[i]; - // handle->socket_id = 0; - // handle->epoch_id = 0; - // handle->socket_type = 0; - // handle->socket_direction = 0; - handle->pull_notification_adddr = 0; - handle->socket_state = SocketState::CLOSING; - socket_info->socket_setup_pending++; - break; - } - } -} - -inline void socket_open_for_connect(packet_header_t* packet) { - // open a send socket in response to a connect message received from a - // socket receiver. - // Local device has not yet opened a send socket, but we need to update - // local socket state. - for (uint32_t i = 0; i < MAX_SOCKETS; i++) { - if (socket_info->sockets[i].socket_state == 0) { - // idle socket handle. - socket_handle_t* handle = (socket_handle_t*)&socket_info->sockets[i]; - handle->socket_id = packet->packet_parameters.socket_parameters.socket_id; - handle->epoch_id = packet->packet_parameters.socket_parameters.epoch_id; - handle->socket_type = packet->packet_parameters.socket_parameters.socket_type; - handle->socket_direction = SOCKET_DIRECTION_SEND; - handle->pull_notification_adddr = - ((uint64_t)packet->session.target_offset_h << 32) | packet->session.target_offset_l; - handle->routing_plane = packet->packet_parameters.socket_parameters.routing_plane; - handle->socket_state = SocketState::OPENING; - handle->sender_dev_id = routing_table[0].my_device_id; - handle->sender_mesh_id = routing_table[0].my_mesh_id; - handle->rcvr_dev_id = packet->routing.src_dev_id; - handle->rcvr_mesh_id = packet->routing.src_mesh_id; - DPRINT << "GK: Opened Send Socket for Remote " << (uint32_t)handle->socket_id << ENDL(); - - break; - } - } -} - -inline void socket_connect(packet_header_t* packet) { - // connect messsage is initiated by a receiving socket. 
- // find a matching send socket in local state. - bool found_sender = false; - for (uint32_t i = 0; i < MAX_SOCKETS; i++) { - if (socket_info->sockets[i].socket_state == SocketState::OPENING && - socket_info->sockets[i].socket_direction == SOCKET_DIRECTION_SEND) { - socket_handle_t* handle = (socket_handle_t*)&socket_info->sockets[i]; - if (handle->socket_id != packet->packet_parameters.socket_parameters.socket_id) { - continue; - } - if (handle->epoch_id != packet->packet_parameters.socket_parameters.epoch_id) { - continue; - } - if (handle->sender_dev_id != packet->routing.dst_dev_id) { - continue; - } - if (handle->sender_mesh_id != packet->routing.dst_mesh_id) { - continue; - } - if (handle->rcvr_dev_id != packet->routing.src_dev_id) { - continue; - } - if (handle->rcvr_mesh_id != packet->routing.src_mesh_id) { - continue; - } - found_sender = true; - handle->pull_notification_adddr = - ((uint64_t)packet->session.target_offset_h << 32) | packet->session.target_offset_l; - handle->socket_state = SocketState::ACTIVE; - set_socket_active(handle); - DPRINT << "GK: Found Send Socket " << (uint32_t)handle->socket_id << ENDL(); - break; - } - } - - if (!found_sender) { - // No send socket opened yet. - // Device has not made it to the epoch where a send socket is opened. - // Log the receive socket connect request. - socket_open_for_connect(packet); - } -} - -uint32_t get_next_hop_router_noc_xy(packet_header_t* current_packet_header, uint32_t routing_plane) { - uint32_t dst_mesh_id = current_packet_header->routing.dst_mesh_id; - if (dst_mesh_id != routing_table->my_mesh_id) { - uint32_t next_port = routing_table[routing_plane].inter_mesh_table.dest_entry[dst_mesh_id]; - return eth_chan_to_noc_xy[noc_index][next_port]; - } else { - uint32_t dst_device_id = current_packet_header->routing.dst_dev_id; - uint32_t next_port = routing_table[routing_plane].intra_mesh_table.dest_entry[dst_device_id]; - return eth_chan_to_noc_xy[noc_index][next_port]; - } -} - -inline bool send_gk_message(uint64_t dest_addr, packet_header_t* packet) { - uint64_t noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, wrptr); - noc_fast_atomic_increment( - noc_index, - NCRISC_AT_CMD_BUF, - noc_addr, - NOC_UNICAST_WRITE_VC, - 1, - FVCC_BUF_LOG_SIZE, - false, - false, - (uint32_t)&socket_info->wrptr.ptr); - while (!ncrisc_noc_nonposted_atomics_flushed(noc_index)); - - uint32_t wrptr = socket_info->wrptr.ptr; - noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, rdptr); - - noc_async_read_one_packet(noc_addr, (uint32_t)(&socket_info->rdptr.ptr), 4); - noc_async_read_barrier(); - if (fvcc_buf_ptrs_full(wrptr, socket_info->rdptr.ptr)) { - return false; - } - - uint32_t dest_wr_index = wrptr & FVCC_SIZE_MASK; - noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, msg_buf) + dest_wr_index * sizeof(packet_header_t); - noc_async_write_one_packet((uint32_t)(packet), noc_addr, sizeof(packet_header_t), noc_index); - return true; -} - -inline bool retry_gk_message(uint64_t dest_addr, packet_header_t* packet) { - uint32_t wrptr = socket_info->wrptr.ptr; - uint64_t noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, rdptr); - - noc_async_read_one_packet(noc_addr, (uint32_t)(&socket_info->rdptr.ptr), 4); - noc_async_read_barrier(); - if (fvcc_buf_ptrs_full(wrptr, socket_info->rdptr.ptr)) { - return false; - } - - uint32_t dest_wr_index = wrptr & FVCC_SIZE_MASK; - noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, msg_buf) + dest_wr_index * sizeof(packet_header_t); - noc_async_write_one_packet((uint32_t)(packet), noc_addr, 
sizeof(packet_header_t), noc_index); - gk_message_pending = 0; - return true; -} - -inline void process_pending_socket() { - if (socket_info->socket_setup_pending == 0) { - // there is no pending socket setup work. - // or there is a pending message to be sent out - return; - } - - // Check write flush of previous gk message write. - // TODO. - - chan_request_entry_t* message = (chan_request_entry_t*)&socket_info->gk_message; - - if (gk_message_pending == 1) { - if (retry_gk_message(router_addr, &message->packet_header)) { - // decrement pending count. - // if there is more pending socket work, will pick up on next iteration. - socket_info->socket_setup_pending--; - } - } else { - for (uint32_t i = 0; i < MAX_SOCKETS; i++) { - if (socket_info->sockets[i].socket_state == SocketState::OPENING && - socket_info->sockets[i].socket_direction == SOCKET_DIRECTION_RECV) { - // send a message on fvcc to socket data sender. - socket_handle_t* handle = (socket_handle_t*)&socket_info->sockets[i]; - message->packet_header.routing.dst_dev_id = handle->sender_dev_id; - message->packet_header.routing.dst_mesh_id = handle->sender_mesh_id; - message->packet_header.routing.src_dev_id = handle->rcvr_dev_id; - message->packet_header.routing.src_mesh_id = handle->rcvr_mesh_id; - message->packet_header.routing.packet_size_bytes = PACKET_HEADER_SIZE_BYTES; - message->packet_header.routing.flags = SYNC; - message->packet_header.session.command = SOCKET_CONNECT; - message->packet_header.session.target_offset_h = handle->pull_notification_adddr >> 32; - message->packet_header.session.target_offset_l = (uint32_t)handle->pull_notification_adddr; - message->packet_header.packet_parameters.socket_parameters.socket_id = handle->socket_id; - message->packet_header.packet_parameters.socket_parameters.epoch_id = handle->epoch_id; - message->packet_header.packet_parameters.socket_parameters.socket_type = handle->socket_type; - message->packet_header.packet_parameters.socket_parameters.socket_direction = SOCKET_DIRECTION_RECV; - message->packet_header.packet_parameters.socket_parameters.routing_plane = handle->routing_plane; - tt_fabric_add_header_checksum((packet_header_t*)&message->packet_header); - - router_addr = - ((uint64_t)get_next_hop_router_noc_xy(&message->packet_header, handle->routing_plane) << 32) | - FVCC_OUT_BUF_START; - - if (send_gk_message(router_addr, &message->packet_header)) { - DPRINT << "GK: Sending Connect to " << (uint32_t)handle->sender_dev_id << ENDL(); - handle->socket_state = SocketState::ACTIVE; - socket_info->socket_setup_pending--; - } else { - DPRINT << "GK: Pending Connect to " << (uint32_t)handle->sender_dev_id << ENDL(); - // gatekeeper message buffer is full. need to retry on next iteration. 
- gk_message_pending = 1; - } - } - } - } -} - -inline void get_routing_tables() { - uint32_t temp_mask = router_mask; - uint32_t channel = 0; - uint32_t routing_plane = 0; - for (uint32_t i = 0; i < 4; i++) { - if (temp_mask & 0xF) { - temp_mask &= 0xF; - break; - } else { - temp_mask >>= 4; - } - channel += 4; - } - - if (temp_mask) { - for (uint32_t i = 0; i < 4; i++) { - if (temp_mask & 0x1) { - uint64_t router_config_addr = ((uint64_t)eth_chan_to_noc_xy[noc_index][channel] << 32) | - eth_l1_mem::address_map::FABRIC_ROUTER_CONFIG_BASE; - noc_async_read_one_packet( - router_config_addr, - (uint32_t)&routing_table[routing_plane], - sizeof(tt::tt_fabric::fabric_router_l1_config_t)); - routing_plane++; - } - temp_mask >>= 1; - channel++; - } - } - gk_info->routing_planes = routing_plane; - noc_async_read_barrier(); -} - -void kernel_main() { - sync_val = get_arg_val(0); - router_mask = get_arg_val(1); - gk_message_pending = 0; - router_addr = 0; - - tt_fabric_init(); - - write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_STARTED); - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000000); - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX + 1, 0xbb000000); - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX + 2, 0xAABBCCDD); - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX + 3, 0xDDCCBBAA); - - zero_l1_buf((tt_l1_ptr uint32_t*)&gk_info->gk_msg_buf, FVCC_BUF_SIZE_BYTES); - zero_l1_buf((tt_l1_ptr uint32_t*)socket_info, sizeof(socket_info_t)); - - sync_all_routers(); - get_routing_tables(); - uint64_t start_timestamp = get_timestamp(); - - uint32_t loop_count = 0; - uint32_t total_messages_procesed = 0; - volatile ctrl_chan_msg_buf* msg_buf = &gk_info->gk_msg_buf; - while (1) { - if (!gk_msg_buf_is_empty(msg_buf) && gk_msg_valid(msg_buf)) { - uint32_t msg_index = msg_buf->rdptr.ptr & FVCC_SIZE_MASK; - chan_request_entry_t* req = (chan_request_entry_t*)msg_buf->msg_buf + msg_index; - packet_header_t* packet = &req->packet_header; - if (tt_fabric_is_header_valid(packet)) { - total_messages_procesed++; - DPRINT << "GK: Message Received " << (uint32_t)packet->session.command - << " msg num = " << total_messages_procesed << ENDL(); - if (packet->routing.flags == SYNC) { - if (packet->session.command == SOCKET_OPEN) { - DPRINT << "GK: Socket Open " << ENDL(); - socket_open(packet); - } else if (packet->session.command == SOCKET_CLOSE) { - socket_close(packet); - } else if (packet->session.command == SOCKET_CONNECT) { - DPRINT << "GK: Socket Connect " << ENDL(); - socket_connect(packet); - } - } - - // clear the flags field to invalidate pull request slot. - // flags will be set to non-zero by next requestor. - gk_msg_buf_advance_rdptr((ctrl_chan_msg_buf*)msg_buf); - loop_count = 0; - } else { - write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_BAD_HEADER); - return; - } - } - - loop_count++; - - process_pending_socket(); - - if (gk_info->router_sync.val == 0) { - // terminate signal from host sw. - if (loop_count >= 0x1000) { - // send terminate to all chip routers. 
- notify_all_routers(0); - break; - } - } - } - - DPRINT << "Gatekeeper messages processed " << total_messages_procesed << ENDL(); - - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000002); - - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000003); - - write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_PASS); - - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff00005); -} diff --git a/tt_fabric/impl/kernels/tt_fabric_router.cpp b/tt_fabric/impl/kernels/tt_fabric_router.cpp index ef966d46984..847002383f2 100644 --- a/tt_fabric/impl/kernels/tt_fabric_router.cpp +++ b/tt_fabric/impl/kernels/tt_fabric_router.cpp @@ -4,14 +4,12 @@ // clang-format off #include "dataflow_api.h" -#include "tt_fabric/hw/inc/tt_fabric.h" +#include "hw/inc/tt_fabric.h" // clang-format on router_state_t router_state __attribute__((aligned(16))); fvc_consumer_state_t fvc_consumer_state __attribute__((aligned(16))); // replicate for each fvc fvc_producer_state_t fvc_producer_state __attribute__((aligned(16))); // replicate for each fvc -fvcc_inbound_state_t fvcc_inbound_state __attribute__((aligned(16))); // inbound fabric virtual control channel -fvcc_outbound_state_t fvcc_outbound_state __attribute__((aligned(16))); // outbound fabric virtual control channel volatile local_pull_request_t local_pull_request_temp __attribute__((aligned(16))); // replicate for each fvc volatile local_pull_request_t* local_pull_request = &local_pull_request_temp; // replicate for each fvc @@ -19,11 +17,11 @@ constexpr uint32_t fvc_data_buf_size_words = get_compile_time_arg_val(0); constexpr uint32_t fvc_data_buf_size_bytes = fvc_data_buf_size_words * PACKET_WORD_SIZE_BYTES; constexpr uint32_t kernel_status_buf_addr_arg = get_compile_time_arg_val(1); constexpr uint32_t kernel_status_buf_size_bytes = get_compile_time_arg_val(2); +// constexpr uint32_t sync_val = get_compile_time_arg_val(3); +// constexpr uint32_t router_mask = get_compile_time_arg_val(4); constexpr uint32_t timeout_cycles = get_compile_time_arg_val(3); uint32_t sync_val; uint32_t router_mask; -uint32_t gk_message_addr_l; -uint32_t gk_message_addr_h; constexpr uint32_t PACKET_QUEUE_STAUS_MASK = 0xabc00000; constexpr uint32_t PACKET_QUEUE_TEST_STARTED = PACKET_QUEUE_STAUS_MASK | 0x0; @@ -50,23 +48,30 @@ uint64_t xy_local_addr; #define SWITCH_THRESHOLD 0x3FF -inline void notify_gatekeeper() { - // send semaphore increment to gatekeeper on this device. +inline void notify_all_routers() { + uint32_t channel = 0; + uint32_t remaining_cores = router_mask; + // send semaphore increment to all fabric routers on this device. // semaphore notifies all other routers that this router has completed // startup handshake with its ethernet peer. 
- uint64_t dest_addr = - (((uint64_t)gk_message_addr_h << 32) | gk_message_addr_l) + offsetof(gatekeeper_info_t, router_sync); - noc_fast_atomic_increment( - noc_index, - NCRISC_AT_CMD_BUF, - dest_addr, - NOC_UNICAST_WRITE_VC, - 1, - 31, - false, - false, - MEM_NOC_ATOMIC_RET_VAL_ADDR); - + for (uint32_t i = 0; i < 16; i++) { + if (remaining_cores == 0) { + break; + } + if (remaining_cores & (0x1 << i)) { + uint64_t dest_addr = ((uint64_t)eth_chan_to_noc_xy[noc_index][i] << 32) | FABRIC_ROUTER_SYNC_SEM; + noc_fast_atomic_increment( + noc_index, + NCRISC_AT_CMD_BUF, + dest_addr, + NOC_UNICAST_WRITE_VC, + 1, + 31, + false, + false, + MEM_NOC_ATOMIC_RET_VAL_ADDR); + } + } volatile uint32_t* sync_sem_addr = (volatile uint32_t*)FABRIC_ROUTER_SYNC_SEM; // wait for all device routers to have incremented the sync semaphore. // sync_val is equal to number of tt-fabric routers running on a device. @@ -79,11 +84,8 @@ inline void notify_gatekeeper() { void kernel_main() { rtos_context_switch_ptr = (void (*)())RtosTable[0]; - uint32_t rt_args_idx = 0; - sync_val = get_arg_val(rt_args_idx++); - router_mask = get_arg_val(rt_args_idx++); - gk_message_addr_l = get_arg_val(rt_args_idx++); - gk_message_addr_h = get_arg_val(rt_args_idx++); + sync_val = get_arg_val(0); + router_mask = get_arg_val(1); tt_fabric_init(); @@ -97,8 +99,6 @@ void kernel_main() { router_state.sync_out = 0; zero_l1_buf((tt_l1_ptr uint32_t*)fvc_consumer_req_buf, sizeof(chan_req_buf)); - zero_l1_buf((tt_l1_ptr uint32_t*)FVCC_IN_BUF_START, FVCC_IN_BUF_SIZE); - zero_l1_buf((tt_l1_ptr uint32_t*)FVCC_OUT_BUF_START, FVCC_OUT_BUF_SIZE); write_kernel_status(kernel_status, PQ_TEST_WORD_CNT_INDEX, (uint32_t)&router_state); write_kernel_status(kernel_status, PQ_TEST_WORD_CNT_INDEX + 1, (uint32_t)&fvc_consumer_state); write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX + 1, (uint32_t)&fvc_producer_state); @@ -110,19 +110,12 @@ void kernel_main() { fvc_data_buf_size_words / 2, (uint32_t)&fvc_consumer_state.remote_rdptr); - fvcc_outbound_state.init( - FVCC_OUT_BUF_START, FVCC_SYNC_BUF_START, FVCC_IN_BUF_START, (uint32_t)&fvcc_inbound_state.inbound_wrptr); - fvcc_inbound_state.init( - FVCC_IN_BUF_START, - (uint32_t)&fvcc_outbound_state.remote_rdptr, - (((uint64_t)gk_message_addr_h << 32) | gk_message_addr_l) + offsetof(gatekeeper_info_t, gk_msg_buf)); - if (!wait_all_src_dest_ready(&router_state, timeout_cycles)) { write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_TIMEOUT); return; } - notify_gatekeeper(); + notify_all_routers(); uint64_t start_timestamp = get_timestamp(); write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000001); @@ -202,9 +195,6 @@ void kernel_main() { return; } - fvcc_inbound_state.fvcc_handler(); - fvcc_outbound_state.fvcc_handler(); - loop_count++; // need to optimize this. 
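A minimal host-side sketch of the barrier semantics introduced by notify_all_routers() above, for reviewers who want to reason about the handshake without the NoC machinery. std::atomic counters stand in for the per-router FABRIC_ROUTER_SYNC_SEM word in L1, fetch_add stands in for noc_fast_atomic_increment landing on the remote erisc, and the thread harness plus names such as kNumRouters and router_main are illustrative assumptions, not part of this patch.

#include <atomic>
#include <cassert>
#include <cstdint>
#include <thread>
#include <vector>

// Illustrative stand-ins: one atomic per router models FABRIC_ROUTER_SYNC_SEM,
// and fetch_add models the noc_fast_atomic_increment sent to each peer.
constexpr uint32_t kNumRouters = 4;           // plays the role of sync_val
std::atomic<uint32_t> sync_sem[kNumRouters];  // zero-initialized, like the zeroed L1 sem

void router_main(uint32_t my_id, uint32_t router_mask) {
    // notify_all_routers(): bump the sync semaphore on every router selected by
    // router_mask (including this one), mirroring the loop over eth_chan_to_noc_xy.
    for (uint32_t i = 0; i < kNumRouters; i++) {
        if (router_mask & (1u << i)) {
            sync_sem[i].fetch_add(1, std::memory_order_relaxed);
        }
    }
    // Then spin on the local semaphore until all peers have checked in,
    // i.e. the kernel's `while (*sync_sem_addr != sync_val)` wait.
    while (sync_sem[my_id].load(std::memory_order_relaxed) != kNumRouters) {
        std::this_thread::yield();
    }
}

int main() {
    const uint32_t router_mask = (1u << kNumRouters) - 1;
    std::vector<std::thread> routers;
    for (uint32_t id = 0; id < kNumRouters; id++) {
        routers.emplace_back(router_main, id, router_mask);
    }
    for (auto& t : routers) {
        t.join();
    }
    // Every router observed all peers before leaving the barrier.
    for (auto& s : sync_sem) {
        assert(s.load() == kNumRouters);
    }
    return 0;
}

Compared with the deleted notify_gatekeeper() path, every router now both produces and consumes the same semaphore, so the startup handshake no longer funnels through a single gatekeeper core.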
diff --git a/tt_metal/hw/inc/grayskull/eth_l1_address_map.h b/tt_metal/hw/inc/grayskull/eth_l1_address_map.h index d345637891c..3ce95f1beba 100644 --- a/tt_metal/hw/inc/grayskull/eth_l1_address_map.h +++ b/tt_metal/hw/inc/grayskull/eth_l1_address_map.h @@ -33,7 +33,7 @@ struct address_map { static constexpr std::int32_t MAX_L1_LOADING_SIZE = 1; static constexpr std::int32_t FABRIC_ROUTER_CONFIG_BASE = 0; - static constexpr std::int32_t FABRIC_ROUTER_CONFIG_SIZE = 2064; + static constexpr std::int32_t FABRIC_ROUTER_CONFIG_SIZE = 2056; static constexpr std::int32_t ERISC_L1_UNRESERVED_SIZE = 0; static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_SCRATCH = 0; diff --git a/tt_metal/hw/inc/wormhole/eth_l1_address_map.h b/tt_metal/hw/inc/wormhole/eth_l1_address_map.h index e28c477a8a2..faa1814d985 100644 --- a/tt_metal/hw/inc/wormhole/eth_l1_address_map.h +++ b/tt_metal/hw/inc/wormhole/eth_l1_address_map.h @@ -63,7 +63,7 @@ struct address_map { static constexpr std::int32_t ERISC_L1_KERNEL_CONFIG_BASE = ERISC_MEM_MAILBOX_END; static constexpr std::int32_t FABRIC_ROUTER_CONFIG_BASE = (ERISC_L1_KERNEL_CONFIG_BASE + ERISC_L1_KERNEL_CONFIG_SIZE + 31) & ~31; - static constexpr std::int32_t FABRIC_ROUTER_CONFIG_SIZE = 2064; + static constexpr std::int32_t FABRIC_ROUTER_CONFIG_SIZE = 2056; static constexpr std::int32_t ERISC_L1_UNRESERVED_BASE = (FABRIC_ROUTER_CONFIG_BASE + FABRIC_ROUTER_CONFIG_SIZE + 31) & ~31; static constexpr std::int32_t ERISC_L1_UNRESERVED_SIZE = MAX_L1_LOADING_SIZE - ERISC_L1_UNRESERVED_BASE;
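One closing note on the header checksum helpers this patch moves into tt_fabric.h (the tt_fabric_add_header_checksum / tt_fabric_is_header_valid additions earlier in the diff): they sum the header's 16-bit words starting at index 2, complement the sum, double it, and store or compare the result in packet_parameters.misc_parameters.words[0]. The standalone sketch below reproduces that round trip; the 48-byte size, the FakeHeader type, and the assumption that the stored checksum occupies the first four bytes of the header (which is what skipping words 0 and 1 suggests) are illustrative stand-ins rather than the real packet_header_t definition.

#include <cassert>
#include <cstdint>
#include <cstring>

// Stand-in for packet_header_t: size and the placement of the stored checksum
// in the first 32-bit word are assumptions made for this sketch only.
constexpr uint32_t kHeaderSizeBytes = 48;

struct FakeHeader {
    alignas(4) uint8_t bytes[kHeaderSizeBytes];
};

// Mirrors the folding in tt_fabric_add_header_checksum(): sum 16-bit words
// 2..N-1 (skipping the checksum slot), complement, then double.
uint32_t compute_checksum(const FakeHeader* h) {
    const uint16_t* ptr = reinterpret_cast<const uint16_t*>(h->bytes);
    uint32_t sum = 0;
    for (uint32_t i = 2; i < kHeaderSizeBytes / 2; i++) {
        sum += ptr[i];
    }
    sum = ~sum;
    sum += sum;
    return sum;
}

void add_header_checksum(FakeHeader* h) {
    uint32_t sum = compute_checksum(h);
    std::memcpy(h->bytes, &sum, sizeof(sum));  // stand-in for misc_parameters.words[0]
}

bool is_header_valid(const FakeHeader* h) {
    uint32_t stored;
    std::memcpy(&stored, h->bytes, sizeof(stored));
    return stored == compute_checksum(h);
}

int main() {
    FakeHeader h{};
    for (uint32_t i = 4; i < kHeaderSizeBytes; i++) {
        h.bytes[i] = static_cast<uint8_t>(i * 7);  // arbitrary routing/session payload
    }
    add_header_checksum(&h);
    assert(is_header_valid(&h));

    h.bytes[10] ^= 0x1;  // corrupt one byte outside the checksum slot
    assert(!is_header_valid(&h));
    return 0;
}

Flipping any byte outside the first word changes the folded sum, so tt_fabric_is_header_valid-style validation catches single-byte corruption of the routing and session fields, though like any plain sum it is not collision-proof.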