From 75f09691aeaf8dd9515d585dd0094a014aea630a Mon Sep 17 00:00:00 2001 From: Chen Shaoyuan Date: Thu, 19 Dec 2024 18:45:34 +0800 Subject: [PATCH] add topology discovery --- mooncake-transfer-engine/include/topology.h | 6 + mooncake-transfer-engine/src/topology.cpp | 235 ++++++++++++++++++ mooncake-transfer-engine/tests/CMakeLists.txt | 6 +- .../tests/topology_test.cpp | 20 ++ 4 files changed, 265 insertions(+), 2 deletions(-) create mode 100644 mooncake-transfer-engine/include/topology.h create mode 100644 mooncake-transfer-engine/src/topology.cpp create mode 100644 mooncake-transfer-engine/tests/topology_test.cpp diff --git a/mooncake-transfer-engine/include/topology.h b/mooncake-transfer-engine/include/topology.h new file mode 100644 index 0000000..988e292 --- /dev/null +++ b/mooncake-transfer-engine/include/topology.h @@ -0,0 +1,6 @@ +#include + +namespace mooncake +{ + std::string discoverTopologyMatrix(); +} diff --git a/mooncake-transfer-engine/src/topology.cpp b/mooncake-transfer-engine/src/topology.cpp new file mode 100644 index 0000000..5f49808 --- /dev/null +++ b/mooncake-transfer-engine/src/topology.cpp @@ -0,0 +1,235 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef USE_CUDA +#include "cuda_runtime.h" +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include "topology.h" + +struct InfinibandDevice +{ + std::string name; + std::string pci_bus_id; + int numa_node; +}; + +struct TopologyEntry +{ + std::string name; + std::vector preferred_hca; + std::vector avail_hca; + + Json::Value to_json() + { + Json::Value matrix(Json::arrayValue); + Json::Value hca_list(Json::arrayValue); + for (auto &hca : preferred_hca) + { + hca_list.append(hca); + } + matrix.append(hca_list); + hca_list.clear(); + for (auto &hca : avail_hca) + { + hca_list.append(hca); + } + matrix.append(hca_list); + return matrix; + } +}; + +static std::vector list_infiniband_devices() +{ + DIR *dir = opendir("/sys/class/infiniband"); + struct dirent *entry; + std::vector devices; + + if (dir == NULL) + { + LOG(WARNING) << "failed to list /sys/class/infiniband"; + return {}; + } + while ((entry = readdir(dir))) + { + if (entry->d_name[0] == '.') + { + continue; + } + + std::string device_name = entry->d_name; + + char path[PATH_MAX]; + char resolved_path[PATH_MAX]; + snprintf(path, sizeof(path), "/sys/class/infiniband/%s/../..", entry->d_name); + if (realpath(path, resolved_path) == NULL) + { + LOG(ERROR) << "realpath: " << strerror(errno); + continue; + } + std::string pci_bus_id = basename(resolved_path); + + int numa_node = -1; + snprintf(path, sizeof(path), "%s/numa_node", resolved_path); + std::ifstream(path) >> numa_node; + + devices.push_back(InfinibandDevice{.name = std::move(device_name), + .pci_bus_id = std::move(pci_bus_id), + .numa_node = numa_node}); + } + (void)closedir(dir); + return devices; +} + +static std::vector discover_cpu_topology(const std::vector &all_hca) +{ + DIR *dir = opendir("/sys/devices/system/node"); + struct dirent *entry; + std::vector topology; + + if (dir == NULL) + { + LOG(WARNING) << "failed to list /sys/devices/system/node"; + return {}; + } + while ((entry = readdir(dir))) + { + const char *prefix = "node"; + if (entry->d_type != DT_DIR || strncmp(entry->d_name, prefix, strlen(prefix)) != 0) + { + continue; + } + int node_id = atoi(entry->d_name + strlen(prefix)); + std::vector preferred_hca; + std::vector avail_hca; + for (const auto &hca : all_hca) + { + if (hca.numa_node == node_id) + { + preferred_hca.push_back(hca.name); + } + else + { + avail_hca.push_back(hca.name); + } + } + topology.push_back(TopologyEntry{.name = "cpu:" + std::to_string(node_id), + .preferred_hca = std::move(preferred_hca), + .avail_hca = std::move(avail_hca)}); + } + (void)closedir(dir); + return topology; +} + +#ifdef USE_CUDA + +static int get_pci_distance(const char *bus1, const char *bus2) +{ + char buf[PATH_MAX]; + char path1[PATH_MAX]; + char path2[PATH_MAX]; + snprintf(buf, sizeof(buf), "/sys/bus/pci/devices/%s", bus1); + if (realpath(buf, path1) == NULL) + { + return -1; + } + snprintf(buf, sizeof(buf), "/sys/bus/pci/devices/%s", bus2); + if (realpath(buf, path2) == NULL) + { + return -1; + } + + char *ptr1 = path1; + char *ptr2 = path2; + while (*ptr1 && *ptr1 == *ptr2) + { + ptr1++; + ptr2++; + } + int distance = 0; + for (; *ptr1; ptr1++) + { + distance += (*ptr1 == '/'); + } + for (; *ptr2; ptr2++) + { + distance += (*ptr2 == '/'); + } + + return distance; +} + +static std::vector discover_cuda_topology(const std::vector &all_hca) +{ + std::vector topology; + int device_count; + if (cudaGetDeviceCount(&device_count) != cudaSuccess) + { + device_count = 0; + } + for (int i = 0; i < device_count; i++) + { + char pci_bus_id[20]; + if (cudaDeviceGetPCIBusId(pci_bus_id, sizeof(pci_bus_id), i) != cudaSuccess) + { + continue; + } + for (char *ch = pci_bus_id; (*ch = tolower(*ch)); ch++) + ; + + std::vector preferred_hca; + std::vector avail_hca; + for (const auto &hca : all_hca) + { + if (get_pci_distance(hca.pci_bus_id.c_str(), pci_bus_id) == 0) + { + preferred_hca.push_back(hca.name); + } + else + { + avail_hca.push_back(hca.name); + } + } + topology.push_back(TopologyEntry{.name = "cuda:" + std::to_string(i), + .preferred_hca = std::move(preferred_hca), + .avail_hca = std::move(avail_hca)}); + } + return topology; +} + +#endif // USE_CUDA + +namespace mooncake +{ + std::string discoverTopologyMatrix() + { + auto all_hca = list_infiniband_devices(); + Json::Value value(Json::objectValue); + for (auto &ent : discover_cpu_topology(all_hca)) + { + value[ent.name] = ent.to_json(); + } +#ifdef USE_CUDA + for (auto &ent : discover_cuda_topology(all_hca)) + { + value[ent.name] = ent.to_json(); + } +#endif + return value.toStyledString(); + } +} diff --git a/mooncake-transfer-engine/tests/CMakeLists.txt b/mooncake-transfer-engine/tests/CMakeLists.txt index d77bd50..497f2f1 100644 --- a/mooncake-transfer-engine/tests/CMakeLists.txt +++ b/mooncake-transfer-engine/tests/CMakeLists.txt @@ -4,7 +4,6 @@ target_link_libraries(rdma_transport_test PUBLIC transfer_engine) add_executable(transport_uint_test transport_uint_test.cpp) target_link_libraries(transport_uint_test PUBLIC transfer_engine gtest gtest_main ) - add_executable(rdma_transport_test2 rdma_transport_test2.cpp) target_link_libraries(rdma_transport_test2 PUBLIC transfer_engine gtest gtest_main ) @@ -12,4 +11,7 @@ add_executable(tcp_transport_test tcp_transport_test.cpp) target_link_libraries(tcp_transport_test PUBLIC transfer_engine gtest gtest_main ) add_executable(transfer_metadata_test transfer_metadata_test.cpp) -target_link_libraries(transfer_metadata_test PUBLIC transfer_engine gtest gtest_main) \ No newline at end of file +target_link_libraries(transfer_metadata_test PUBLIC transfer_engine gtest gtest_main) + +add_executable(topology_test topology_test.cpp) +target_link_libraries(topology_test PUBLIC transfer_engine gtest gtest_main) diff --git a/mooncake-transfer-engine/tests/topology_test.cpp b/mooncake-transfer-engine/tests/topology_test.cpp new file mode 100644 index 0000000..b98a01b --- /dev/null +++ b/mooncake-transfer-engine/tests/topology_test.cpp @@ -0,0 +1,20 @@ +#include +#include + +#include "transfer_metadata.h" +#include "topology.h" + +TEST(ToplogyTest, GetTopologyMatrix) +{ + std::string topo = mooncake::discoverTopologyMatrix(); + LOG(INFO) << topo; + mooncake::TransferMetadata::PriorityMatrix matrix; + std::vector rnic_list; + mooncake::TransferMetadata::parseNicPriorityMatrix(topo, matrix, rnic_list); +} + +int main(int argc, char **argv) +{ + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file