diff --git a/doc/en/transfer-engine.md b/doc/en/transfer-engine.md index 0f01181..923a0f2 100644 --- a/doc/en/transfer-engine.md +++ b/doc/en/transfer-engine.md @@ -61,6 +61,8 @@ For instance, as illustrated in figure above, to transfer data from buffer 0 (as To further maximize bandwidth utilization, if a single request's transfer is internally divided into multiple slices if its length exeeds 16KB. Each slice might use a different path, enabling collaborative work among all RDMA NICs. +If you do not want to manually configure the topology matrix, we also provide a function (`mooncake::discoverTopologyMatrix` in `topology.h`) to automatically discover the toplogy between CPU/CUDA and RDMA devices. The automatic discovery mechanism might not always be accurate, and we welcome your feedbacks and improvement ideas! + ### Endpoint Management Mooncake Store employs a pair of end- points to represent the connection between a local RDMA diff --git a/mooncake-transfer-engine/include/topology.h b/mooncake-transfer-engine/include/topology.h new file mode 100644 index 0000000..e31ba50 --- /dev/null +++ b/mooncake-transfer-engine/include/topology.h @@ -0,0 +1,5 @@ +#include + +namespace mooncake { +std::string discoverTopologyMatrix(); +} diff --git a/mooncake-transfer-engine/src/topology.cpp b/mooncake-transfer-engine/src/topology.cpp new file mode 100644 index 0000000..8a71f0c --- /dev/null +++ b/mooncake-transfer-engine/src/topology.cpp @@ -0,0 +1,205 @@ +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef USE_CUDA +#include "cuda_runtime.h" +#endif + +#include +#include +#include +#include +#include +#include + +#include "topology.h" + +struct InfinibandDevice { + std::string name; + std::string pci_bus_id; + int numa_node; +}; + +struct TopologyEntry { + std::string name; + std::vector preferred_hca; + std::vector avail_hca; + + Json::Value to_json() { + Json::Value matrix(Json::arrayValue); + Json::Value hca_list(Json::arrayValue); + for (auto &hca : preferred_hca) { + hca_list.append(hca); + } + matrix.append(hca_list); + hca_list.clear(); + for (auto &hca : avail_hca) { + hca_list.append(hca); + } + matrix.append(hca_list); + return matrix; + } +}; + +static std::vector list_infiniband_devices() { + DIR *dir = opendir("/sys/class/infiniband"); + struct dirent *entry; + std::vector devices; + + if (dir == NULL) { + LOG(WARNING) << "failed to list /sys/class/infiniband"; + return {}; + } + while ((entry = readdir(dir))) { + if (entry->d_name[0] == '.') { + continue; + } + + std::string device_name = entry->d_name; + + char path[PATH_MAX]; + char resolved_path[PATH_MAX]; + snprintf(path, sizeof(path), "/sys/class/infiniband/%s/../..", + entry->d_name); + if (realpath(path, resolved_path) == NULL) { + LOG(ERROR) << "realpath: " << strerror(errno); + continue; + } + std::string pci_bus_id = basename(resolved_path); + + int numa_node = -1; + snprintf(path, sizeof(path), "%s/numa_node", resolved_path); + std::ifstream(path) >> numa_node; + + devices.push_back(InfinibandDevice{.name = std::move(device_name), + .pci_bus_id = std::move(pci_bus_id), + .numa_node = numa_node}); + } + (void)closedir(dir); + return devices; +} + +static std::vector discover_cpu_topology( + const std::vector &all_hca) { + DIR *dir = opendir("/sys/devices/system/node"); + struct dirent *entry; + std::vector topology; + + if (dir == NULL) { + LOG(WARNING) << "failed to list /sys/devices/system/node"; + return {}; + } + while ((entry = readdir(dir))) { + const char *prefix = "node"; + if (entry->d_type != DT_DIR || + strncmp(entry->d_name, prefix, strlen(prefix)) != 0) { + continue; + } + int node_id = atoi(entry->d_name + strlen(prefix)); + std::vector preferred_hca; + std::vector avail_hca; + for (const auto &hca : all_hca) { + if (hca.numa_node == node_id) { + preferred_hca.push_back(hca.name); + } else { + avail_hca.push_back(hca.name); + } + } + topology.push_back( + TopologyEntry{.name = "cpu:" + std::to_string(node_id), + .preferred_hca = std::move(preferred_hca), + .avail_hca = std::move(avail_hca)}); + } + (void)closedir(dir); + return topology; +} + +#ifdef USE_CUDA + +static int get_pci_distance(const char *bus1, const char *bus2) { + char buf[PATH_MAX]; + char path1[PATH_MAX]; + char path2[PATH_MAX]; + snprintf(buf, sizeof(buf), "/sys/bus/pci/devices/%s", bus1); + if (realpath(buf, path1) == NULL) { + return -1; + } + snprintf(buf, sizeof(buf), "/sys/bus/pci/devices/%s", bus2); + if (realpath(buf, path2) == NULL) { + return -1; + } + + char *ptr1 = path1; + char *ptr2 = path2; + while (*ptr1 && *ptr1 == *ptr2) { + ptr1++; + ptr2++; + } + int distance = 0; + for (; *ptr1; ptr1++) { + distance += (*ptr1 == '/'); + } + for (; *ptr2; ptr2++) { + distance += (*ptr2 == '/'); + } + + return distance; +} + +static std::vector discover_cuda_topology( + const std::vector &all_hca) { + std::vector topology; + int device_count; + if (cudaGetDeviceCount(&device_count) != cudaSuccess) { + device_count = 0; + } + for (int i = 0; i < device_count; i++) { + char pci_bus_id[20]; + if (cudaDeviceGetPCIBusId(pci_bus_id, sizeof(pci_bus_id), i) != + cudaSuccess) { + continue; + } + for (char *ch = pci_bus_id; (*ch = tolower(*ch)); ch++) + ; + + std::vector preferred_hca; + std::vector avail_hca; + for (const auto &hca : all_hca) { + if (get_pci_distance(hca.pci_bus_id.c_str(), pci_bus_id) == 0) { + preferred_hca.push_back(hca.name); + } else { + avail_hca.push_back(hca.name); + } + } + topology.push_back( + TopologyEntry{.name = "cuda:" + std::to_string(i), + .preferred_hca = std::move(preferred_hca), + .avail_hca = std::move(avail_hca)}); + } + return topology; +} + +#endif // USE_CUDA + +namespace mooncake { +std::string discoverTopologyMatrix() { + auto all_hca = list_infiniband_devices(); + Json::Value value(Json::objectValue); + for (auto &ent : discover_cpu_topology(all_hca)) { + value[ent.name] = ent.to_json(); + } +#ifdef USE_CUDA + for (auto &ent : discover_cuda_topology(all_hca)) { + value[ent.name] = ent.to_json(); + } +#endif + return value.toStyledString(); +} +} // namespace mooncake diff --git a/mooncake-transfer-engine/tests/CMakeLists.txt b/mooncake-transfer-engine/tests/CMakeLists.txt index d77bd50..497f2f1 100644 --- a/mooncake-transfer-engine/tests/CMakeLists.txt +++ b/mooncake-transfer-engine/tests/CMakeLists.txt @@ -4,7 +4,6 @@ target_link_libraries(rdma_transport_test PUBLIC transfer_engine) add_executable(transport_uint_test transport_uint_test.cpp) target_link_libraries(transport_uint_test PUBLIC transfer_engine gtest gtest_main ) - add_executable(rdma_transport_test2 rdma_transport_test2.cpp) target_link_libraries(rdma_transport_test2 PUBLIC transfer_engine gtest gtest_main ) @@ -12,4 +11,7 @@ add_executable(tcp_transport_test tcp_transport_test.cpp) target_link_libraries(tcp_transport_test PUBLIC transfer_engine gtest gtest_main ) add_executable(transfer_metadata_test transfer_metadata_test.cpp) -target_link_libraries(transfer_metadata_test PUBLIC transfer_engine gtest gtest_main) \ No newline at end of file +target_link_libraries(transfer_metadata_test PUBLIC transfer_engine gtest gtest_main) + +add_executable(topology_test topology_test.cpp) +target_link_libraries(topology_test PUBLIC transfer_engine gtest gtest_main) diff --git a/mooncake-transfer-engine/tests/topology_test.cpp b/mooncake-transfer-engine/tests/topology_test.cpp new file mode 100644 index 0000000..ec5c342 --- /dev/null +++ b/mooncake-transfer-engine/tests/topology_test.cpp @@ -0,0 +1,21 @@ +#include "topology.h" + +#include +#include + +#include "transfer_metadata.h" + +TEST(ToplogyTest, GetTopologyMatrix) +{ + std::string topo = mooncake::discoverTopologyMatrix(); + LOG(INFO) << topo; + mooncake::TransferMetadata::PriorityMatrix matrix; + std::vector rnic_list; + mooncake::TransferMetadata::parseNicPriorityMatrix(topo, matrix, rnic_list); +} + +int main(int argc, char **argv) +{ + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +}