diff --git a/doc/en/transfer-engine.md b/doc/en/transfer-engine.md index 923a0f2..573ad81 100644 --- a/doc/en/transfer-engine.md +++ b/doc/en/transfer-engine.md @@ -61,7 +61,7 @@ For instance, as illustrated in figure above, to transfer data from buffer 0 (as To further maximize bandwidth utilization, if a single request's transfer is internally divided into multiple slices if its length exeeds 16KB. Each slice might use a different path, enabling collaborative work among all RDMA NICs. -If you do not want to manually configure the topology matrix, we also provide a function (`mooncake::discoverTopologyMatrix` in `topology.h`) to automatically discover the toplogy between CPU/CUDA and RDMA devices. The automatic discovery mechanism might not always be accurate, and we welcome your feedbacks and improvement ideas! +If you do not want to manually configure the topology matrix, we also provide a function (`mooncake::discoverTopologyMatrix` in `topology.h`) to automatically discover the topology between CPU/CUDA and RDMA devices. Support for more device types is a work in progress. The automatic discovery mechanism might not always be accurate, and we welcome your feedback and improvement ideas! ### Endpoint Management Mooncake Store employs a pair of endpoints to represent the connection between a local RDMA NIC and a remote RDMA NIC. diff --git a/doc/zh/transfer-engine.md b/doc/zh/transfer-engine.md index bc47a51..f382e02 100644 --- a/doc/zh/transfer-engine.md +++ b/doc/zh/transfer-engine.md @@ -52,6 +52,8 @@ BatchTransfer API 使用请求(Request)对象数组传入用户请求,需 为了进一步最大化带宽利用率,如果单个请求的传输长度超过16KB,则其内部被划分为多个切片。每个切片可能使用不同的路径,使所有RDMA NIC能够协同工作。 +如果不想手动配置拓扑矩阵,也可以直接调用`mooncake::discoverTopologyMatrix`(位于`topology.h`)来自动生成拓扑矩阵。该函数能够自动探查CPU/CUDA和RDMA网卡之间的拓扑关系。对于更多设备种类的支持正在开发中。目前,拓扑自动发现机制可能无法给出准确的硬件拓扑,我们欢迎您的反馈和改进建议!
+ ### 端点管理 Transfer Engine 使用一对端点来表示本地RDMA NIC和远程RDMA NIC之间的连接。实际上,每个端点包括一个或多个RDMA QP对象。 Transfer Engine 中的连接是按需建立的;端点在第一次请求之前保持未配对状态。 diff --git a/mooncake-transfer-engine/src/topology.cpp b/mooncake-transfer-engine/src/topology.cpp index 8a71f0c..3827689 100644 --- a/mooncake-transfer-engine/src/topology.cpp +++ b/mooncake-transfer-engine/src/topology.cpp @@ -9,7 +9,7 @@ #include #ifdef USE_CUDA -#include "cuda_runtime.h" +#include <cuda_runtime.h> #endif #include @@ -66,6 +66,9 @@ static std::vector list_infiniband_devices() { char path[PATH_MAX]; char resolved_path[PATH_MAX]; + // Get the PCI bus id for the infiniband device. Note that + // "/sys/class/infiniband/mlx5_X/" is a symlink to + // "/sys/devices/pciXXXX:XX/XXXX:XX:XX.X/infiniband/mlx5_X/". snprintf(path, sizeof(path), "/sys/class/infiniband/%s/../..", entry->d_name); if (realpath(path, resolved_path) == NULL) { @@ -105,6 +108,7 @@ static std::vector discover_cpu_topology( int node_id = atoi(entry->d_name + strlen(prefix)); std::vector preferred_hca; std::vector avail_hca; + // An HCA connected to the same CPU NUMA node is preferred. for (const auto &hca : all_hca) { if (hca.numa_node == node_id) { preferred_hca.push_back(hca.name); @@ -172,6 +176,8 @@ static std::vector discover_cuda_topology( std::vector preferred_hca; std::vector avail_hca; for (const auto &hca : all_hca) { + // FIXME: currently we only identify the NICs connected to the same + // PCIe switch/RC as the GPU as preferred. if (get_pci_distance(hca.pci_bus_id.c_str(), pci_bus_id) == 0) { preferred_hca.push_back(hca.name); } else { @@ -189,6 +195,7 @@ static std::vector discover_cuda_topology( #endif // USE_CUDA namespace mooncake { +// TODO: add black/white lists for devices. std::string discoverTopologyMatrix() { auto all_hca = list_infiniband_devices(); Json::Value value(Json::objectValue);