diff --git a/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp b/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp index d7e9e9598ae..775f93b1861 100644 --- a/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp +++ b/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp @@ -20,6 +20,36 @@ class MultiDeviceFixture : public DispatchFixture { void SetUp() override { this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); } }; +class TwoDeviceFixture : public MultiDeviceFixture { +protected: + void SetUp() override { + this->slow_dispatch_ = true; + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (!slow_dispatch) { + tt::log_info(tt::LogTest, "This suite can only be run with TT_METAL_SLOW_DISPATCH_MODE set"); + this->slow_dispatch_ = false; + GTEST_SKIP(); + } + + MultiDeviceFixture::SetUp(); + + const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); + const size_t num_pci_devices = tt::tt_metal::GetNumPCIeDevices(); + if (num_devices == 2) { + std::vector ids; + for (chip_id_t id = 0; id < num_devices; id++) { + ids.push_back(id); + } + + const auto& dispatch_core_config = tt::llrt::RunTimeOptions::get_instance().get_dispatch_core_config(); + tt::DevicePool::initialize(ids, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_config); + this->devices_ = tt::DevicePool::instance().get_all_active_devices(); + } else { + GTEST_SKIP() << "TwoDeviceFixture can only be run on machines with two devices"; + } + } +}; + class N300DeviceFixture : public MultiDeviceFixture { protected: void SetUp() override { diff --git a/tests/tt_metal/tt_metal/eth/test_buffer_movement_kernels.cpp b/tests/tt_metal/tt_metal/eth/test_buffer_movement_kernels.cpp index e4d2c50869e..0f3619b93f9 100644 --- a/tests/tt_metal/tt_metal/eth/test_buffer_movement_kernels.cpp +++ b/tests/tt_metal/tt_metal/eth/test_buffer_movement_kernels.cpp @@ -300,11 +300,14 @@ bool chip_to_chip_interleaved_buffer_transfer( } // namespace unit_tests::erisc::kernels -TEST_F(N300DeviceFixture, ActiveEthKernelsSendDramBufferChip0ToChip1) { +TEST_F(TwoDeviceFixture, ActiveEthKernelsSendDramBufferChip0ToChip1) { const auto& sender_device = devices_.at(0); const auto& receiver_device = devices_.at(1); for (const auto& sender_eth_core : sender_device->get_active_ethernet_cores(true)) { + if (not tt::Cluster::instance().is_ethernet_link_up(sender_device->id(), sender_eth_core)) { + continue; + } CoreCoord receiver_eth_core = std::get<1>(sender_device->get_connected_ethernet_core(sender_eth_core)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( @@ -338,11 +341,14 @@ TEST_F(N300DeviceFixture, ActiveEthKernelsSendDramBufferChip0ToChip1) { } } -TEST_F(N300DeviceFixture, ActiveEthKernelsSendDramBufferChip1ToChip0) { +TEST_F(TwoDeviceFixture, ActiveEthKernelsSendDramBufferChip1ToChip0) { const auto& sender_device = devices_.at(1); const auto& receiver_device = devices_.at(0); for (const auto& sender_eth_core : sender_device->get_active_ethernet_cores(true)) { + if (not tt::Cluster::instance().is_ethernet_link_up(sender_device->id(), sender_eth_core)) { + continue; + } CoreCoord receiver_eth_core = std::get<1>(sender_device->get_connected_ethernet_core(sender_eth_core)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( @@ -452,6 +458,9 @@ TEST_F(DeviceFixture, ActiveEthKernelsSendInterleavedBufferAllConnectedChips) { continue; } for (const auto& sender_eth_core : sender_device->get_active_ethernet_cores(true)) { + if (not tt::Cluster::instance().is_ethernet_link_up(sender_device->id(), sender_eth_core)) { + continue; + } auto [device_id, receiver_eth_core] = sender_device->get_connected_ethernet_core(sender_eth_core); if (receiver_device->id() != device_id) { continue; @@ -521,6 +530,9 @@ TEST_F(CommandQueueMultiDeviceProgramFixture, ActiveEthKernelsSendDramBufferAllC continue; } for (const auto& sender_eth_core : sender_device->get_active_ethernet_cores(true)) { + if (not tt::Cluster::instance().is_ethernet_link_up(sender_device->id(), sender_eth_core)) { + continue; + } auto [device_id, receiver_eth_core] = sender_device->get_connected_ethernet_core(sender_eth_core); if (receiver_device->id() != device_id) { continue; diff --git a/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp b/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp index 9c96515a0f1..c3657b3e012 100644 --- a/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp +++ b/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp @@ -517,6 +517,9 @@ TEST_F(DeviceFixture, ActiveEthKernelsDirectSendAllConnectedChips) { continue; } for (const auto& sender_core : sender_device->get_active_ethernet_cores(true)) { + if (not tt::Cluster::instance().is_ethernet_link_up(sender_device->id(), sender_core)) { + continue; + } auto [device_id, receiver_core] = sender_device->get_connected_ethernet_core(sender_core); if (receiver_device->id() != device_id) { continue; @@ -562,7 +565,7 @@ TEST_F(DeviceFixture, ActiveEthKernelsDirectSendAllConnectedChips) { } } -TEST_F(N300DeviceFixture, ActiveEthKernelsBidirectionalDirectSend) { +TEST_F(TwoDeviceFixture, ActiveEthKernelsBidirectionalDirectSend) { using namespace CMAKE_UNIQUE_NAMESPACE; const auto& device_0 = devices_.at(0); const auto& device_1 = devices_.at(1); @@ -571,6 +574,9 @@ TEST_F(N300DeviceFixture, ActiveEthKernelsBidirectionalDirectSend) { const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { + if (not tt::Cluster::instance().is_ethernet_link_up(device_0->id(), sender_core)) { + continue; + } CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( static_cast(this), @@ -592,6 +598,9 @@ TEST_F(N300DeviceFixture, ActiveEthKernelsBidirectionalDirectSend) { sender_core)); } for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { + if (not tt::Cluster::instance().is_ethernet_link_up(device_0->id(), sender_core)) { + continue; + } CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( static_cast(this), @@ -613,6 +622,9 @@ TEST_F(N300DeviceFixture, ActiveEthKernelsBidirectionalDirectSend) { sender_core)); } for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { + if (not tt::Cluster::instance().is_ethernet_link_up(device_0->id(), sender_core)) { + continue; + } CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( static_cast(this), @@ -634,6 +646,9 @@ TEST_F(N300DeviceFixture, ActiveEthKernelsBidirectionalDirectSend) { sender_core)); } for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { + if (not tt::Cluster::instance().is_ethernet_link_up(device_0->id(), sender_core)) { + continue; + } CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( static_cast(this), @@ -656,7 +671,7 @@ TEST_F(N300DeviceFixture, ActiveEthKernelsBidirectionalDirectSend) { } } -TEST_F(N300DeviceFixture, ActiveEthKernelsRepeatedDirectSends) { +TEST_F(TwoDeviceFixture, ActiveEthKernelsRepeatedDirectSends) { using namespace CMAKE_UNIQUE_NAMESPACE; const auto& device_0 = devices_.at(0); const auto& device_1 = devices_.at(1); @@ -665,6 +680,9 @@ TEST_F(N300DeviceFixture, ActiveEthKernelsRepeatedDirectSends) { const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { + if (not tt::Cluster::instance().is_ethernet_link_up(device_0->id(), sender_core)) { + continue; + } CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); for (int i = 0; i < 10; i++) { ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( @@ -691,7 +709,7 @@ TEST_F(N300DeviceFixture, ActiveEthKernelsRepeatedDirectSends) { } } -TEST_F(N300DeviceFixture, ActiveEthKernelsRandomDirectSendTests) { +TEST_F(TwoDeviceFixture, ActiveEthKernelsRandomDirectSendTests) { using namespace CMAKE_UNIQUE_NAMESPACE; srand(0); const auto& device_0 = devices_.at(0); @@ -712,6 +730,11 @@ TEST_F(N300DeviceFixture, ActiveEthKernelsRandomDirectSendTests) { const auto& send_chip = devices_.at(std::get<0>(it->first)); CoreCoord sender_core = std::get<1>(it->first); + + if (not tt::Cluster::instance().is_ethernet_link_up(send_chip->id(), sender_core)) { + continue; + } + const auto& receiver_chip = devices_.at(std::get<0>(it->second)); CoreCoord receiver_core = std::get<1>(it->second); @@ -736,17 +759,23 @@ TEST_F(N300DeviceFixture, ActiveEthKernelsRandomDirectSendTests) { receiver_core)); } } -TEST_F(N300DeviceFixture, ActiveEthKernelsRandomEthPacketSizeDirectSendTests) { +TEST_F(TwoDeviceFixture, ActiveEthKernelsRandomEthPacketSizeDirectSendTests) { srand(0); const auto& device_0 = devices_.at(0); const auto& device_1 = devices_.at(1); std::map, std::tuple> connectivity = {}; for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { + if (not tt::Cluster::instance().is_ethernet_link_up(device_0->id(), sender_core)) { + continue; + } const auto& receiver_core = device_0->get_connected_ethernet_core(sender_core); connectivity.insert({{0, sender_core}, receiver_core}); } for (const auto& sender_core : device_1->get_active_ethernet_cores(true)) { + if (not tt::Cluster::instance().is_ethernet_link_up(device_1->id(), sender_core)) { + continue; + } const auto& receiver_core = device_1->get_connected_ethernet_core(sender_core); connectivity.insert({{1, sender_core}, receiver_core}); } @@ -799,6 +828,9 @@ TEST_F(CommandQueueMultiDeviceProgramFixture, ActiveEthKernelsDirectSendAllConne continue; } for (const auto& sender_core : sender_device->get_active_ethernet_cores(true)) { + if (not tt::Cluster::instance().is_ethernet_link_up(sender_device->id(), sender_core)) { + continue; + } auto [device_id, receiver_core] = sender_device->get_connected_ethernet_core(sender_core); if (receiver_device->id() != device_id) { continue; diff --git a/tt_metal/api/tt-metalium/dev_msgs.h b/tt_metal/api/tt-metalium/dev_msgs.h index 92e1427e47d..66de66cefa3 100644 --- a/tt_metal/api/tt-metalium/dev_msgs.h +++ b/tt_metal/api/tt-metalium/dev_msgs.h @@ -308,9 +308,9 @@ struct addressable_core_t { // This is the number of Ethernet cores on WH (Ethernet cores can be queried through Virtual Coordinates). // All other Non Worker Cores are not accessible through virtual coordinates. Subject to change, depending on the arch. constexpr static std::uint32_t MAX_VIRTUAL_NON_WORKER_CORES = 18; -// This is the total number of Non Worker Cores on WH (first term is Ethernet, second term is PCIe and last term is +// This is the total number of Non Worker Cores on WH (first term is DRAM, second term is PCIe and last term is // DRAM). -constexpr static std::uint32_t MAX_NON_WORKER_CORES = MAX_VIRTUAL_NON_WORKER_CORES + 1 + 16; +constexpr static std::uint32_t MAX_NON_WORKER_CORES = 24 + 1 + 14; constexpr static std::uint32_t MAX_HARVESTED_ROWS = 2; constexpr static std::uint8_t CORE_COORD_INVALID = 0xFF; struct core_info_msg_t { diff --git a/tt_metal/api/tt-metalium/device_impl.hpp b/tt_metal/api/tt-metalium/device_impl.hpp index 40499b619f2..04401ab0b8e 100644 --- a/tt_metal/api/tt-metalium/device_impl.hpp +++ b/tt_metal/api/tt-metalium/device_impl.hpp @@ -85,8 +85,12 @@ class Device : public IDevice { // Ethernet API CoreCoord ethernet_core_from_logical_core(const CoreCoord &logical_core) const override; CoreCoord logical_core_from_ethernet_core(const CoreCoord ðernet_core) const override; + // `skip_reserved_tunnel_cores` is ignored on BH because there are no ethernet cores used for Fast Dispatch + // tunneling std::unordered_set get_active_ethernet_cores(bool skip_reserved_tunnel_cores=false) const override; std::unordered_set get_inactive_ethernet_cores() const override; + // `skip_reserved_tunnel_cores` is ignored on BH because there are no ethernet cores used for Fast Dispatch + // tunneling bool is_active_ethernet_core(CoreCoord logical_core, bool skip_reserved_tunnel_cores=false) const override; std::tuple get_connected_ethernet_core(CoreCoord eth_core) const override; std::vector get_ethernet_sockets(chip_id_t connected_chip_id) const override; diff --git a/tt_metal/api/tt-metalium/hal.hpp b/tt_metal/api/tt-metalium/hal.hpp index 3dafb8bcbf1..d9b7c42a302 100644 --- a/tt_metal/api/tt-metalium/hal.hpp +++ b/tt_metal/api/tt-metalium/hal.hpp @@ -159,6 +159,7 @@ class Hal { bool coordinate_virtualization_enabled_; uint32_t virtual_worker_start_x_; uint32_t virtual_worker_start_y_; + bool eth_fw_is_cooperative_; // set when eth riscs have to context switch float eps_ = 0.0f; float nan_ = 0.0f; @@ -226,6 +227,7 @@ class Hal { bool is_coordinate_virtualization_enabled() const { return this->coordinate_virtualization_enabled_; }; std::uint32_t get_virtual_worker_start_x() const { return this->virtual_worker_start_x_; } std::uint32_t get_virtual_worker_start_y() const { return this->virtual_worker_start_y_; } + bool get_eth_fw_is_cooperative() const { return this->eth_fw_is_cooperative_; } uint32_t get_programmable_core_type_count() const; HalProgrammableCoreType get_programmable_core_type(uint32_t core_type_index) const; uint32_t get_programmable_core_type_index(HalProgrammableCoreType programmable_core_type_index) const; diff --git a/tt_metal/common/metal_soc_descriptor.cpp b/tt_metal/common/metal_soc_descriptor.cpp index 0535250dd26..29a8d71582e 100644 --- a/tt_metal/common/metal_soc_descriptor.cpp +++ b/tt_metal/common/metal_soc_descriptor.cpp @@ -160,8 +160,8 @@ CoordSystem metal_SocDescriptor::get_umd_coord_system() const { } void metal_SocDescriptor::generate_logical_eth_coords_mapping() { - for (int i = 0; i < this->get_cores(CoreType::ETH).size(); i++) { - this->logical_eth_core_to_chan_map.insert({{0, i}, i}); + for (const auto& logical_coord : this->get_cores(CoreType::ETH, CoordSystem::LOGICAL)) { + this->logical_eth_core_to_chan_map.insert({{logical_coord.x, logical_coord.y}, logical_coord.y}); } } diff --git a/tt_metal/hw/firmware/src/active_erisc.cc b/tt_metal/hw/firmware/src/active_erisc.cc index 448144b0b0d..fd3f02e15a7 100644 --- a/tt_metal/hw/firmware/src/active_erisc.cc +++ b/tt_metal/hw/firmware/src/active_erisc.cc @@ -121,6 +121,8 @@ int main() { flush_erisc_icache(); + firmware_config_init(mailboxes, ProgrammableCoreType::ACTIVE_ETH, DISPATCH_CLASS_ETH_DM0); + enum dispatch_core_processor_masks enables = (enum dispatch_core_processor_masks)launch_msg_address->kernel_config.enables; diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 71cae1833ab..d2ae89d10be 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -412,8 +412,14 @@ void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreC case HalProgrammableCoreType::ACTIVE_ETH: case HalProgrammableCoreType::IDLE_ETH: { bool is_idle_eth = core_type == HalProgrammableCoreType::IDLE_ETH; - if (is_idle_eth) { - tt::Cluster::instance().assert_risc_reset_at_core(tt_cxy_pair(this->id(), virtual_core)); + TensixSoftResetOptions reset_val = TENSIX_ASSERT_SOFT_RESET; + if (not is_idle_eth) { + reset_val = + reset_val & static_cast( + ~std::underlying_type::type(TensixSoftResetOptions::BRISC)); + } + if (is_idle_eth or !hal.get_eth_fw_is_cooperative()) { + tt::Cluster::instance().assert_risc_reset_at_core(tt_cxy_pair(this->id(), virtual_core), reset_val); } if (not llrt::RunTimeOptions::get_instance().get_skip_loading_fw()) { for (uint32_t processor_class = 0; processor_class < processor_class_count; processor_class++) { @@ -488,23 +494,27 @@ void Device::reset_cores() { std::unordered_map> dispatch_cores, other_dispatch_cores, device_to_early_exit_cores; go_msg_t go_msg; std::memset(&go_msg, 0, sizeof(go_msg_t)); - for (const auto ð_core : this->get_active_ethernet_cores()) { - CoreCoord virtual_core = this->ethernet_core_from_logical_core(eth_core); - if (erisc_app_still_running(virtual_core)) { - std::vector data(sizeof(launch_msg_t) / sizeof(uint32_t)); - DeviceAddr launch_addr = hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::LAUNCH); - - data = tt::llrt::read_hex_vec_from_core(this->id(), virtual_core, launch_addr, sizeof(launch_msg_t)); - launch_msg_t* launch_msg = (launch_msg_t*)(&data[0]); - log_info( - tt::LogMetal, - "While initializing Device {}, ethernet tunneler core {} on Device {} detected as still running, issuing exit signal.", - this->id(), - virtual_core.str(), - this->id()); - launch_msg->kernel_config.exit_erisc_kernel = 1; - llrt::write_launch_msg_to_core(this->id(), virtual_core, launch_msg, &go_msg, launch_addr, false); - device_to_early_exit_cores[this->id()].insert(virtual_core); + if (hal.get_eth_fw_is_cooperative()) { + for (const auto& eth_core : this->get_active_ethernet_cores()) { + CoreCoord virtual_core = this->ethernet_core_from_logical_core(eth_core); + if (erisc_app_still_running(virtual_core)) { + std::vector data(sizeof(launch_msg_t) / sizeof(uint32_t)); + DeviceAddr launch_addr = + hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::LAUNCH); + + data = tt::llrt::read_hex_vec_from_core(this->id(), virtual_core, launch_addr, sizeof(launch_msg_t)); + launch_msg_t* launch_msg = (launch_msg_t*)(&data[0]); + log_info( + tt::LogMetal, + "While initializing Device {}, ethernet tunneler core {} on Device {} detected as still running, " + "issuing exit signal.", + this->id(), + virtual_core.str(), + this->id()); + launch_msg->kernel_config.exit_erisc_kernel = 1; + llrt::write_launch_msg_to_core(this->id(), virtual_core, launch_msg, &go_msg, launch_addr, false); + device_to_early_exit_cores[this->id()].insert(virtual_core); + } } } @@ -692,12 +702,17 @@ void Device::initialize_and_launch_firmware() { this->id(), virtual_core, zero_vec_erisc_init, hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::APP_SYNC_INFO)); } - // Load erisc app base FW to eth cores + // Load erisc app base FW to eth cores on WH and active_erisc FW on second risc of BH active eth cores + std::unordered_set active_eth_cores; for (const auto ð_core : this->get_active_ethernet_cores()) { CoreCoord phys_eth_core = this->ethernet_core_from_logical_core(eth_core); tt::llrt::write_hex_vec_to_core( this->id(), phys_eth_core, core_info_vec, this->get_dev_addr(phys_eth_core, HalL1MemAddrType::CORE_INFO)); this->initialize_firmware(HalProgrammableCoreType::ACTIVE_ETH, phys_eth_core, &launch_msg, &go_msg); + if (!hal.get_eth_fw_is_cooperative()) { + active_eth_cores.insert(phys_eth_core); + not_done_cores.insert(phys_eth_core); + } } for (const auto ð_core : this->get_inactive_ethernet_cores()) { @@ -712,8 +727,18 @@ void Device::initialize_and_launch_firmware() { tt::Cluster::instance().l1_barrier(this->id()); // Deassert worker cores - for(const auto& worker_core : not_done_cores) - tt::Cluster::instance().deassert_risc_reset_at_core(tt_cxy_pair(this->id(), worker_core)); + TensixSoftResetOptions reset_val; + for (const auto& worker_core : not_done_cores) { + if (active_eth_cores.find(worker_core) != active_eth_cores.end()) { + // bit 12 needs to be deasserted to run second erisc on BH + reset_val = TENSIX_DEASSERT_SOFT_RESET & + static_cast( + ~std::underlying_type::type(TensixSoftResetOptions::TRISC0)); + } else { + reset_val = TENSIX_DEASSERT_SOFT_RESET; + } + tt::Cluster::instance().deassert_risc_reset_at_core(tt_cxy_pair(this->id(), worker_core), reset_val); + } // Wait until fw init is done, ensures the next launch msg doesn't get // written while fw is still in init @@ -744,21 +769,25 @@ void Device::clear_l1_state() { // These L1 ranges are restricted becase UMD base routing FW uses L1 below FIRMWARE_BASE and // between TILE_HEADER_BUFFER_BASE to COMMAND_Q_BASE // Clear erisc sync info - for (const auto ð_core : this->get_active_ethernet_cores()) { - - static const uint32_t max_l1_loading_size = hal.get_dev_size(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::UNRESERVED) + hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::UNRESERVED); - - static std::vector zero_vec_above_tile_header_buffer( - (max_l1_loading_size - hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::TILE_HEADER_BUFFER)) / sizeof(uint32_t), 0); + for (const auto& eth_core : this->get_active_ethernet_cores()) { + static const uint32_t max_l1_loading_size = + hal.get_dev_size(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::UNRESERVED) + + hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::UNRESERVED); + + static uint32_t zero_vec_size = max_l1_loading_size; + auto zero_vec_addr = HalL1MemAddrType::UNRESERVED; + if (hal.get_eth_fw_is_cooperative()) { + zero_vec_size -= + hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::TILE_HEADER_BUFFER); + zero_vec_addr = HalL1MemAddrType::TILE_HEADER_BUFFER; + } + static std::vector zero_vec(zero_vec_size / sizeof(uint32_t), 0); CoreCoord virtual_core = this->ethernet_core_from_logical_core(eth_core); llrt::write_hex_vec_to_core( - this->id(), - virtual_core, - zero_vec_above_tile_header_buffer, - hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::TILE_HEADER_BUFFER)); + this->id(), virtual_core, zero_vec, hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, zero_vec_addr)); } // TODO: clear idle eriscs as well tt::Cluster::instance().l1_barrier(this->id()); @@ -1018,6 +1047,17 @@ bool Device::close() { } } + if (!hal.get_eth_fw_is_cooperative()) { + for (const auto& eth_core : this->get_active_ethernet_cores()) { + CoreCoord virtual_eth_core = this->ethernet_core_from_logical_core(eth_core); + TensixSoftResetOptions reset_val = + TENSIX_ASSERT_SOFT_RESET & + static_cast( + ~std::underlying_type::type(TensixSoftResetOptions::BRISC)); + tt::Cluster::instance().assert_risc_reset_at_core(tt_cxy_pair(this->id(), virtual_eth_core), reset_val); + } + } + if (this->id_ != mmio_device_id) { for (auto it = not_done_dispatch_cores[mmio_device_id].begin(); it != not_done_dispatch_cores[mmio_device_id].end(); it++) { const auto &virtual_core = *it; diff --git a/tt_metal/impl/kernels/kernel.cpp b/tt_metal/impl/kernels/kernel.cpp index 2f0c7a1f69b..176d9e8741d 100644 --- a/tt_metal/impl/kernels/kernel.cpp +++ b/tt_metal/impl/kernels/kernel.cpp @@ -436,7 +436,6 @@ void EthernetKernel::read_binaries(IDevice* device) { int erisc_id = magic_enum::enum_integer(this->config_.processor); const JitBuildState& build_state = BuildEnvManager::get_instance().get_kernel_build_state( device->build_id(), erisc_core_type, dm_class_idx, erisc_id); - int risc_id = erisc_id + (this->config_.eth_mode == Eth::IDLE ? 6 : 5); // TODO (abhullar): clean this up when llrt helpers use HAL // TODO: fix when active eth supports relo auto load_type = (this->config_.eth_mode == Eth::IDLE) ? ll_api::memory::Loading::CONTIGUOUS_XIP : ll_api::memory::Loading::DISCRETE; diff --git a/tt_metal/jit_build/build_env_manager.cpp b/tt_metal/jit_build/build_env_manager.cpp index afac9cb6c7b..6c22241d3c5 100644 --- a/tt_metal/jit_build/build_env_manager.cpp +++ b/tt_metal/jit_build/build_env_manager.cpp @@ -151,14 +151,13 @@ JitBuildStateSet create_build_state(JitBuildEnv& build_env, chip_id_t device_id, } case HalProgrammableCoreType::ACTIVE_ETH: { // Cooperative means active erisc FW needs to context switch to base FW - bool is_cooperative = tt::Cluster::instance().arch() == ARCH::WORMHOLE_B0; return std::make_shared( build_env, JitBuiltStateConfig{ .processor_id = processor_class, .is_fw = is_fw, .dispatch_message_addr = dispatch_message_addr, - .is_cooperative = is_cooperative}); + .is_cooperative = hal.get_eth_fw_is_cooperative()}); break; } case HalProgrammableCoreType::IDLE_ETH: { diff --git a/tt_metal/llrt/blackhole/bh_hal.cpp b/tt_metal/llrt/blackhole/bh_hal.cpp index 9da9dbaf725..9eccb172bc3 100644 --- a/tt_metal/llrt/blackhole/bh_hal.cpp +++ b/tt_metal/llrt/blackhole/bh_hal.cpp @@ -114,6 +114,7 @@ void Hal::initialize_bh() { this->coordinate_virtualization_enabled_ = COORDINATE_VIRTUALIZATION_ENABLED; this->virtual_worker_start_x_ = VIRTUAL_TENSIX_START_X; this->virtual_worker_start_y_ = VIRTUAL_TENSIX_START_Y; + this->eth_fw_is_cooperative_ = false; this->eps_ = EPS_BH; this->nan_ = NAN_BH; diff --git a/tt_metal/llrt/grayskull/gs_hal.cpp b/tt_metal/llrt/grayskull/gs_hal.cpp index 28442356dcd..168b6c2b3bd 100644 --- a/tt_metal/llrt/grayskull/gs_hal.cpp +++ b/tt_metal/llrt/grayskull/gs_hal.cpp @@ -216,6 +216,7 @@ void Hal::initialize_gs() { this->coordinate_virtualization_enabled_ = COORDINATE_VIRTUALIZATION_ENABLED; this->virtual_worker_start_x_ = VIRTUAL_TENSIX_START_X; this->virtual_worker_start_y_ = VIRTUAL_TENSIX_START_Y; + this->eth_fw_is_cooperative_ = false; this->eps_ = EPS_GS; this->nan_ = NAN_GS; diff --git a/tt_metal/llrt/llrt.cpp b/tt_metal/llrt/llrt.cpp index bba079b97b5..5632110e6d4 100644 --- a/tt_metal/llrt/llrt.cpp +++ b/tt_metal/llrt/llrt.cpp @@ -206,8 +206,7 @@ static bool check_if_riscs_on_specified_core_done(chip_id_t chip_id, const CoreC run_state, RUN_MSG_DONE); TT_FATAL( - run == run_state || run == RUN_MSG_DONE, - "Read unexpected run_mailbox value"); + run == run_state || run == RUN_MSG_DONE, "Read unexpected run_mailbox value from core {}", core.str()); } return run == RUN_MSG_DONE; diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index 11e3fdc30cc..d71967d9d82 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -104,8 +104,6 @@ Cluster::Cluster() { routing_info_addr_ = tt::tt_metal::hal.get_dev_addr(tt::tt_metal::HalProgrammableCoreType::ACTIVE_ETH, tt::tt_metal::HalL1MemAddrType::APP_ROUTING_INFO); } - this->generate_cluster_descriptor(); - this->initialize_device_drivers(); this->reserve_ethernet_cores_for_tunneling(); @@ -142,11 +140,11 @@ void Cluster::generate_cluster_descriptor() { // Cluster descriptor yaml not available for Blackhole bring up if (this->target_type_ == TargetDevice::Simulator) { // Passing simulator reported physical devices as logical devices. - this->cluster_desc_ = tt_ClusterDescriptor::create_mock_cluster(tt_SimulationDevice::detect_available_device_ids(), this->arch_); + this->cluster_desc_ = + tt_ClusterDescriptor::create_mock_cluster(tt_SimulationDevice::detect_available_device_ids(), this->arch_) + .get(); } else { - this->cluster_desc_ = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); - - // Detect cluster type + this->cluster_desc_ = this->driver_->get_cluster_description(); for (const auto &chip_id : this->cluster_desc_->get_all_chips()) { if (this->cluster_desc_->get_board_type(chip_id) == BoardType::GALAXY) { this->cluster_type_ = ClusterType::TG; @@ -207,12 +205,14 @@ void Cluster::generate_cluster_descriptor() { } void Cluster::initialize_device_drivers() { + this->open_driver(); + this->generate_cluster_descriptor(); + this->get_metal_desc_from_tt_desc(); + for (const auto &[mmio_device_id, controlled_devices] : this->devices_grouped_by_assoc_mmio_device_) { this->assign_mem_channels_to_devices(mmio_device_id, controlled_devices); } - this->open_driver(); - tt_device_params default_params; this->start_driver(default_params); this->generate_virtual_to_umd_coord_mapping(); @@ -255,7 +255,7 @@ void Cluster::open_driver(const bool &skip_driver_allocs) { std::unique_ptr device_driver; if (this->target_type_ == TargetDevice::Silicon) { const std::string sdesc_path = get_soc_description_file(this->arch_, this->target_type_); - std::unordered_set all_chips = this->cluster_desc_->get_all_chips(); + const auto& all_chips = tt::umd::Cluster::detect_available_device_ids(); std::set all_chips_set(all_chips.begin(), all_chips.end()); // This is the target/desired number of mem channels per arch/device. // Silicon driver will attempt to open this many hugepages as channels per mmio chip, @@ -296,7 +296,6 @@ void Cluster::open_driver(const bool &skip_driver_allocs) { device_driver->set_barrier_address_params(barrier_params); this->driver_ = std::move(device_driver); - this->get_metal_desc_from_tt_desc(); } void Cluster::start_driver(tt_device_params &device_params) const { @@ -903,14 +902,28 @@ std::unordered_set Cluster::get_ethernet_connected_device_ids(chip_id std::unordered_set Cluster::get_active_ethernet_cores( chip_id_t chip_id, bool skip_reserved_tunnel_cores) const { std::unordered_set active_ethernet_cores; - const auto &connected_chips = this->get_ethernet_cores_grouped_by_connected_chips(chip_id); - for (const auto &[other_chip_id, eth_cores] : connected_chips) { - for (const auto ð_core : eth_cores) { - if (this->device_eth_routing_info_.at(chip_id).at(eth_core) == EthRouterMode::BI_DIR_TUNNELING and - skip_reserved_tunnel_cores) { - continue; + if (arch_ == ARCH::BLACKHOLE) { + // Can't just use `get_ethernet_cores_grouped_by_connected_chips` because there are some active ethernet cores + // without links. Only risc1 on these cores is available for Metal and should not be classified as idle + // to ensure that Metal does not try to program both riscs. + const auto& soc_desc = get_soc_desc(chip_id); + std::set logical_active_eth_channels = cluster_desc_->get_active_eth_channels(chip_id); + for (auto logical_active_eth_channel : logical_active_eth_channels) { + CoreCoord logical_active_eth = + soc_desc.get_eth_core_for_channel(logical_active_eth_channel, CoordSystem::LOGICAL); + active_ethernet_cores.insert(logical_active_eth); + } + + } else { + const auto& connected_chips = this->get_ethernet_cores_grouped_by_connected_chips(chip_id); + for (const auto& [other_chip_id, eth_cores] : connected_chips) { + for (const auto& eth_core : eth_cores) { + if (this->device_eth_routing_info_.at(chip_id).at(eth_core) == EthRouterMode::BI_DIR_TUNNELING and + skip_reserved_tunnel_cores) { + continue; + } + active_ethernet_cores.insert(eth_core); } - active_ethernet_cores.insert(eth_core); } } return active_ethernet_cores; @@ -922,7 +935,6 @@ std::unordered_set Cluster::get_inactive_ethernet_cores(chip_id_t chi std::unordered_set channels_to_skip = {}; // UMD routing FW uses these cores for base routing // channel 15 is used by syseng tools. - // TODO (abhullar): For BH single-chip bringup we assume all ethernet cores are inactive. Update this with (#9823) if (this->is_galaxy_cluster()) { // TODO: This may need to change, if we need additional eth cores for dispatch on Galaxy channels_to_skip = {0, 1, 2, 3, 15}; @@ -941,6 +953,12 @@ std::unordered_set Cluster::get_inactive_ethernet_cores(chip_id_t chi return inactive_ethernet_cores; } +bool Cluster::is_ethernet_link_up(chip_id_t chip_id, const CoreCoord& logical_core) const { + const auto& soc_desc = get_soc_desc(chip_id); + ethernet_channel_t eth_chan = soc_desc.logical_eth_core_to_chan_map.at(logical_core); + return this->cluster_desc_->ethernet_core_has_active_ethernet_link(chip_id, eth_chan); +} + std::tuple Cluster::get_connected_ethernet_core(std::tuple eth_core) const { const auto &soc_desc = get_soc_desc(std::get<0>(eth_core)); ethernet_channel_t eth_chan = soc_desc.logical_eth_core_to_chan_map.at(std::get<1>(eth_core)); diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp index 34f56508a75..d5bb3ca5e6a 100644 --- a/tt_metal/llrt/tt_cluster.hpp +++ b/tt_metal/llrt/tt_cluster.hpp @@ -182,6 +182,9 @@ class Cluster { // Returns set of logical inactive ethernet coordinates on chip std::unordered_set get_inactive_ethernet_cores(chip_id_t chip_id) const; + // Returns whether `logical_core` has an eth link to a core on a connected chip + bool is_ethernet_link_up(chip_id_t chip_id, const CoreCoord& logical_core) const; + // Returns connected ethernet core on the other chip std::tuple get_connected_ethernet_core(std::tuple eth_core) const; @@ -302,7 +305,7 @@ class Cluster { // Need to hold reference to cluster descriptor to detect total number of devices available in cluster // UMD static APIs `detect_available_device_ids` and `detect_number_of_chips` only returns number of MMIO mapped // devices - std::unique_ptr cluster_desc_; + tt_ClusterDescriptor* cluster_desc_; // There is an entry for every device that can be targeted (MMIO and remote) std::unordered_map sdesc_per_chip_; diff --git a/tt_metal/llrt/wormhole/wh_hal.cpp b/tt_metal/llrt/wormhole/wh_hal.cpp index d1f82e96496..338d2b15cd6 100644 --- a/tt_metal/llrt/wormhole/wh_hal.cpp +++ b/tt_metal/llrt/wormhole/wh_hal.cpp @@ -114,6 +114,7 @@ void Hal::initialize_wh() { this->coordinate_virtualization_enabled_ = COORDINATE_VIRTUALIZATION_ENABLED; this->virtual_worker_start_x_ = VIRTUAL_TENSIX_START_X; this->virtual_worker_start_y_ = VIRTUAL_TENSIX_START_Y; + this->eth_fw_is_cooperative_ = true; this->eps_ = EPS_WHB0; this->nan_ = NAN_WHB0; diff --git a/tt_metal/soc_descriptors/blackhole_140_arch.yaml b/tt_metal/soc_descriptors/blackhole_140_arch.yaml index eb86dad45dc..a3333948c5f 100644 --- a/tt_metal/soc_descriptors/blackhole_140_arch.yaml +++ b/tt_metal/soc_descriptors/blackhole_140_arch.yaml @@ -80,7 +80,7 @@ dram_view_size: eth: [ - #1-1, 2-1, 3-1, 4-1, 5-1, 6-1, 7-1, 10-1, 11-1, 12-1, 13-1, 14-1, 15-1, 16-1, + 1-1, 16-1, 2-1, 15-1, 3-1, 14-1, 4-1, 13-1, 5-1, 12-1, 6-1, 11-1, 7-1, 10-1, ] functional_workers: diff --git a/tt_metal/third_party/umd b/tt_metal/third_party/umd index 89e23562388..da0cc30846b 160000 --- a/tt_metal/third_party/umd +++ b/tt_metal/third_party/umd @@ -1 +1 @@ -Subproject commit 89e235623881014e475cdfcd172c2222217346a4 +Subproject commit da0cc30846b2773dc41aeb34c468cea97120d4a8