diff --git a/doc/en/transfer-engine.md b/doc/en/transfer-engine.md index 923a0f2..573ad81 100644 --- a/doc/en/transfer-engine.md +++ b/doc/en/transfer-engine.md @@ -61,7 +61,7 @@ For instance, as illustrated in figure above, to transfer data from buffer 0 (as To further maximize bandwidth utilization, if a single request's transfer is internally divided into multiple slices if its length exeeds 16KB. Each slice might use a different path, enabling collaborative work among all RDMA NICs. -If you do not want to manually configure the topology matrix, we also provide a function (`mooncake::discoverTopologyMatrix` in `topology.h`) to automatically discover the toplogy between CPU/CUDA and RDMA devices. The automatic discovery mechanism might not always be accurate, and we welcome your feedbacks and improvement ideas! +If you do not want to manually configure the topology matrix, we also provide a function (`mooncake::discoverTopologyMatrix` in `topology.h`) to automatically discover the topology between CPU/CUDA and RDMA devices. Support for more device types is a work in progress. The automatic discovery mechanism might not always be accurate, and we welcome your feedback and improvement ideas! ### Endpoint Management Mooncake Store employs a pair of endpoints to represent the connection between a local RDMA NIC and a remote RDMA NIC. diff --git a/doc/zh/transfer-engine.md b/doc/zh/transfer-engine.md index bc47a51..f382e02 100644 --- a/doc/zh/transfer-engine.md +++ b/doc/zh/transfer-engine.md @@ -52,6 +52,8 @@ BatchTransfer API 使用请求(Request)对象数组传入用户请求,需 为了进一步最大化带宽利用率,如果单个请求的传输长度超过16KB,则其内部被划分为多个切片。每个切片可能使用不同的路径,使所有RDMA NIC能够协同工作。 +如果不想手动配置拓扑矩阵,也可以直接调用`mooncake::discoverTopologyMatrix`(位于`topology.h`)来自动生成拓扑矩阵。该函数能够自动探查CPU/CUDA和RDMA网卡之间的拓扑关系。对于更多设备种类的支持正在开发中。目前,拓扑自动发现机制可能无法给出准确的硬件拓扑,我们欢迎您的反馈和改进建议!
+ ### 端点管理 Transfer Engine 使用一对端点来表示本地RDMA NIC和远程RDMA NIC之间的连接。实际上,每个端点包括一个或多个RDMA QP对象。 Transfer Engine 中的连接是按需建立的;端点在第一次请求之前保持未配对状态。 diff --git a/mooncake-transfer-engine/src/topology.cpp b/mooncake-transfer-engine/src/topology.cpp index 8a71f0c..3827689 100644 --- a/mooncake-transfer-engine/src/topology.cpp +++ b/mooncake-transfer-engine/src/topology.cpp @@ -9,7 +9,7 @@ #include #ifdef USE_CUDA -#include "cuda_runtime.h" +#include <cuda_runtime.h> #endif #include @@ -66,6 +66,9 @@ static std::vector list_infiniband_devices() { char path[PATH_MAX]; char resolved_path[PATH_MAX]; + // Get the PCI bus id for the infiniband device. Note that + // "/sys/class/infiniband/mlx5_X/" is a symlink to + // "/sys/devices/pciXXXX:XX/XXXX:XX:XX.X/infiniband/mlx5_X/". snprintf(path, sizeof(path), "/sys/class/infiniband/%s/../..", entry->d_name); if (realpath(path, resolved_path) == NULL) { @@ -105,6 +108,7 @@ static std::vector discover_cpu_topology( int node_id = atoi(entry->d_name + strlen(prefix)); std::vector preferred_hca; std::vector avail_hca; + // An HCA connected to the same CPU NUMA node is preferred. for (const auto &hca : all_hca) { if (hca.numa_node == node_id) { preferred_hca.push_back(hca.name); @@ -172,6 +176,8 @@ static std::vector discover_cuda_topology( std::vector preferred_hca; std::vector avail_hca; for (const auto &hca : all_hca) { + // FIXME: currently we only identify the NICs connected to the same + // PCIe switch/RC as the GPU as preferred. if (get_pci_distance(hca.pci_bus_id.c_str(), pci_bus_id) == 0) { preferred_hca.push_back(hca.name); } else { @@ -189,6 +195,7 @@ static std::vector discover_cuda_topology( #endif // USE_CUDA namespace mooncake { +// TODO: add black/white lists for devices. std::string discoverTopologyMatrix() { auto all_hca = list_infiniband_devices(); Json::Value value(Json::objectValue);