From af836e6d69017dddfc9938bce9c0a6b8fc8bca70 Mon Sep 17 00:00:00 2001 From: Maximilian Date: Thu, 14 Nov 2024 13:20:19 +0100 Subject: [PATCH] Added SW-support for AES, DPI and compression --- examples_sw/apps/rdma_service/client/main.cpp | 28 +++++++++++++++---- examples_sw/apps/rdma_service/server/main.cpp | 27 +++++++++++++++--- sw/include/bThread.hpp | 2 +- sw/include/cDefs.hpp | 16 ++++++++--- sw/include/cLib.hpp | 21 ++++++++++++++ sw/src/bThread.cpp | 20 ++++++++++++- sw/src/cService.cpp | 22 +++++++++++++++ 7 files changed, 121 insertions(+), 15 deletions(-) diff --git a/examples_sw/apps/rdma_service/client/main.cpp b/examples_sw/apps/rdma_service/client/main.cpp index 37a194dd..fea76a98 100644 --- a/examples_sw/apps/rdma_service/client/main.cpp +++ b/examples_sw/apps/rdma_service/client/main.cpp @@ -188,6 +188,9 @@ int main(int argc, char *argv[]) sg.rdma.len = min_size; sg.rdma.local_stream = strmHost; + // Get a hMem to write values into the payload of the RDMA-packets + uint64_t *hMem = (uint64_t*)(cthread.getQpair()->local.vaddr); + // Set the Coyote Operation, which can either be a REMOTE_WRITE or a REMOTE_READ, depending on the settings for the experiment CoyoteOper coper = oper ? 
CoyoteOper::REMOTE_RDMA_WRITE : CoyoteOper::REMOTE_RDMA_READ;; @@ -213,17 +216,21 @@ int main(int argc, char *argv[]) // Lambda-function for throughput-benchmarking auto benchmark_thr = [&]() { // For the desired number of repetitions per size, invoke the cThread-Function with the coyote-Operation - for(int i = 0; i < n_reps_thr; i++) + for(int i = 0; i < n_reps_thr; i++) { # ifdef VERBOSE std::cout << "rdma_client: invoke the operation " << std::endl; # endif cthread.invoke(coper, &sg); + hMem[sg.rdma.len/8-1] = hMem[sg.rdma.len/8-1] + 1; + std::cout << "CLIENT: Sent out message #" << i << " at message-size " << sg.rdma.len << " with content " << hMem[sg.rdma.len/8-1] << std::endl; + } + + // Increment the hMem-value + // hMem[sg.rdma.len/8-1] = hMem[sg.rdma.len/8-1] + 1; // Check the number of completed RDMA-transactions, wait until all operations have been completed. Check for stalling in-between. while(cthread.checkCompleted(CoyoteOper::LOCAL_WRITE) < n_reps_thr) { - # ifdef VERBOSE - std::cout << "rdma_client: Current number of completed operations: " << cthread.checkCompleted(CoyoteOper::LOCAL_WRITE) << std::endl; - # endif + // std::cout << "CLIENT: Current number of completed operations: " << cthread.checkCompleted(CoyoteOper::LOCAL_WRITE) << std::endl; // stalled is an atomic boolean used for event-handling (?) 
that would indicate a stalled operation if( stalled.load() ) throw std::runtime_error("Stalled, SIGINT caught"); } @@ -255,14 +262,25 @@ int main(int argc, char *argv[]) # ifdef VERBOSE std::cout << "rdma_client: invoke the operation " << std::endl; # endif + + // Increment the hMem-value + hMem[sg.rdma.len/8-1] = hMem[sg.rdma.len/8-1] + 1; cthread.invoke(coper, &sg); + + std::cout << "CLIENT: Sent out message #" << i << " at message-size " << sg.rdma.len << " with content " << hMem[sg.rdma.len/8-1] << std::endl; + + bool message_written = false; while(cthread.checkCompleted(CoyoteOper::LOCAL_WRITE) < i+1) { # ifdef VERBOSE std::cout << "rdma_client: Current number of completed operations: " << cthread.checkCompleted(CoyoteOper::LOCAL_WRITE) << std::endl; - # endif + # endif + // As long as the completion is not yet received, check for a possible stall-event if( stalled.load() ) throw std::runtime_error("Stalled, SIGINT caught"); } + + std::cout << "CLIENT: Received an ACK for this message!" << std::endl; + std::cout << "CLIENT: Received the following memory content: " << hMem[sg.rdma.len/8-1] << std::endl; } }; diff --git a/examples_sw/apps/rdma_service/server/main.cpp b/examples_sw/apps/rdma_service/server/main.cpp index 20dc9c5f..512497a4 100644 --- a/examples_sw/apps/rdma_service/server/main.cpp +++ b/examples_sw/apps/rdma_service/server/main.cpp @@ -125,6 +125,9 @@ int main(int argc, char *argv[]) memset(&sg, 0, sizeof(rdmaSg)); sg.rdma.len = min_size; sg.rdma.local_stream = strmHost; + // Get a memory handle to manipulate values in the RDMA payloads + uint64_t *hMem = (uint64_t*)(cthread->getQpair()->local.vaddr); + while(sg.rdma.len <= max_size) { // Sync via the cThread that is part of the cService-daemon that was just started in the background # ifdef VERBOSE @@ -139,14 +142,19 @@ int main(int argc, char *argv[]) if(rdwr) { // THR - wait until all expected WRITEs are coming in. 
Incoming RDMA_WRITEs are LOCAL_WRITEs on this side - while(cthread->checkCompleted(CoyoteOper::LOCAL_WRITE) < n_reps_thr) { } - + while(cthread->checkCompleted(CoyoteOper::LOCAL_WRITE) < n_reps_thr) { + std::cout << "CLIENT: Current number of completed operations: " << cthread->checkCompleted(CoyoteOper::LOCAL_WRITE) << std::endl; + } + // THR - issuing the same amount of "Write-Backs" to the client - for(int i = 0; i < n_reps_thr; i++) + for(int i = 0; i < n_reps_thr; i++) { # ifdef VERBOSE std::cout << "rdma_server: invoke the operation " << std::endl; # endif + hMem[sg.rdma.len/8-1] = hMem[sg.rdma.len/8-1] + 1; cthread->invoke(CoyoteOper::REMOTE_RDMA_WRITE, &sg); + std::cout << "SERVER: Sent out message #" << i << " at message-size " << sg.rdma.len << " with content " << hMem[sg.rdma.len/8-1] << std::endl; + } // Sync via the thread that is located within the cService-daemon # ifdef VERBOSE @@ -161,7 +169,18 @@ int main(int argc, char *argv[]) // LAT - iterate over the number of ping-pong-exchanges according to the desired experiment setting for(int i = 0; i < n_reps_lat; i++) { // Wait for the next incoming WRITE - while(cthread->checkCompleted(CoyoteOper::LOCAL_WRITE) < i+1) { } + bool message_written = false; + while(cthread->checkCompleted(CoyoteOper::LOCAL_WRITE) < i+1) { + if(!message_written) { + std::cout << "RDMA-Server: Waiting for an incoming RDMA-WRITE at currently " << i << "." << std::endl; + message_written = true; + } + } + + // Increment the number in the payload before writing back + hMem[sg.rdma.len/8-1] = hMem[sg.rdma.len/8-1] + 1; + + std::cout << "RDMA-Server: Invoking a RDMA-WRITE from the Server to the Client at currently " << (i+1) << "." 
<< std::endl; cthread->invoke(CoyoteOper::REMOTE_RDMA_WRITE, &sg); } } else { diff --git a/sw/include/bThread.hpp b/sw/include/bThread.hpp index 89cc05e8..e5660a6d 100644 --- a/sw/include/bThread.hpp +++ b/sw/include/bThread.hpp @@ -123,7 +123,7 @@ class bThread { */ // Constructor-Call - bThread(int32_t vfid, pid_t hpid, uint32_t dev, cSched *csched = nullptr, void (*uisr)(int) = nullptr); + bThread(int32_t vfid, pid_t hpid, uint32_t dev, cSched *csched = nullptr, void (*uisr)(int) = nullptr, bool encryption_required = false, bool compression_required = false, bool dpi_required = false); // Destructor-Call ~bThread(); diff --git a/sw/include/cDefs.hpp b/sw/include/cDefs.hpp index 2db70fc2..5cd50148 100644 --- a/sw/include/cDefs.hpp +++ b/sw/include/cDefs.hpp @@ -500,7 +500,7 @@ struct csAlloc { /** * Queue pairs - */ + */ // One queue - a queue pair has a local and a remote copy of this struct ibvQ { @@ -519,6 +519,11 @@ struct ibvQ { // Global ID for identifying a network interface in RDMA-networks (either InfiniBand or RoCE). 
For us, it's mostly a concatination of repeated IP-addresses char gid[33] = { 0 }; + // Balboa capabilities: AES-key, compression-bit and DPI-bit + __uint128_t aes_key; + bool compression_enabled; + bool dpi_enabled; + // Converter GID to integer uint32_t gidToUint(int idx) { if(idx > 24) { @@ -541,13 +546,16 @@ struct ibvQ { } void print(const char *name) { - printf("%s: QPN 0x%06x, PSN 0x%06x, VADDR %016lx, SIZE %08x, IP 0x%08x\n", - name, qpn, psn, (uint64_t)vaddr, size, ip_addr); + uint64_t aes_high = (uint64_t)(aes_key >> 64); + uint64_t aes_low = (uint64_t)(aes_key); + + printf("%s: QPN 0x%06x, PSN 0x%06x, VADDR %016lx, SIZE %08x, IP 0x%08x\n, AES-key 0x%lx%016lx\n, Compression %d\n, DPI %d\n", + name, qpn, psn, (uint64_t)vaddr, size, ip_addr, aes_high, aes_low, compression_enabled, dpi_enabled); } }; /** - * Queue pair - combination of a local and a remote ibvQ + * Queue pair - combination of a local and a remote ibvQ */ struct ibvQp { public: diff --git a/sw/include/cLib.hpp b/sw/include/cLib.hpp index ff9d95c3..17186d23 100644 --- a/sw/include/cLib.hpp +++ b/sw/include/cLib.hpp @@ -183,6 +183,27 @@ class cLib { // Received remote QP is located in the receive buffer and is getting copied over to the thread, which manages all QPs memcpy(&cthread->getQpair()->remote, recv_buff, sizeof(ibvQ)); + // Negotiate the Balboa-capabilities by comparing local and remote queue + + // AES-encryption: The larger aes-key becomes the common one.
If both AES-keys are set to 0, no encryption is used for this QP + if(cthread->getQpair()->local.aes_key > cthread->getQpair()->remote.aes_key) { + cthread->getQpair()->remote.aes_key = cthread->getQpair()->local.aes_key; + } else { + cthread->getQpair()->local.aes_key = cthread->getQpair()->remote.aes_key; + } + + // Compression agreement: If at least one party wants compression, it is used for this communication flow + if(cthread->getQpair()->local.compression_enabled || cthread->getQpair()->remote.compression_enabled) { + cthread->getQpair()->remote.compression_enabled = true; + cthread->getQpair()->local.compression_enabled = true; + } + + // DPI agreement: If at least one party wants to use DPI, it is used for this communication flow + if(cthread->getQpair()->local.dpi_enabled || cthread->getQpair()->remote.dpi_enabled) { + cthread->getQpair()->remote.dpi_enabled = true; + cthread->getQpair()->local.dpi_enabled = true; + } + // Output: Print local and remote QPs std::cout << "Queue pair: " << std::endl; cthread->getQpair()->local.print("Local "); diff --git a/sw/src/bThread.cpp b/sw/src/bThread.cpp index b45ba32e..b1b95cdb 100644 --- a/sw/src/bThread.cpp +++ b/sw/src/bThread.cpp @@ -104,7 +104,7 @@ static unsigned seed = std::chrono::system_clock::now().time_since_epoch().count * * Constructor that sets variables for vfid, cscheduler and lastly the plock (enum open_or_create and a generated name) */ -bThread::bThread(int32_t vfid, pid_t hpid, uint32_t dev, cSched *csched, void (*uisr)(int)) : vfid(vfid), csched(csched), +bThread::bThread(int32_t vfid, pid_t hpid, uint32_t dev, cSched *csched, void (*uisr)(int), bool encryption_required, bool compression_required, bool dpi_required) : vfid(vfid), csched(csched), plock(open_or_create, ("vpga_mtx_user_" + std::to_string(vfid)).c_str()) { DBG3("bThread: opening vFPGA-" << vfid << ", hpid " << hpid); @@ -188,6 +188,7 @@ bThread::bThread(int32_t vfid, pid_t hpid, uint32_t dev, cSched *csched, void (* // Random 
number generators std::default_random_engine rand_gen(seed); std::uniform_int_distribution distr(0, std::numeric_limits::max()); + std::uniform_int_distribution distr_aes(1, std::numeric_limits::max()); // Read the IP-address via a ioctl-system call and store it in tmp if (ioctl(fd, IOCTL_GET_IP_ADDRESS, &tmp)) @@ -208,6 +209,23 @@ bThread::bThread(int32_t vfid, pid_t hpid, uint32_t dev, cSched *csched, void (* qpair->local.psn = distr(rand_gen) & 0xFFFFFF; // Generate a random PSN to start with on the local side qpair->local.rkey = 0; // Local rkey is hard-coded to 0 + // Balboa-capabilities + + // AES-Encryption + if(encryption_required) { + // If AES is required, create a random AES-key as part of the Queue + qpair->local.aes_key = distr_aes(rand_gen); + } else { + // If no AES-encryption is required, set the AES-key to 0. + qpair->local.aes_key = 0; + } + + // Compression-bit + qpair->local.compression_enabled = compression_required; + + // DPI-bit + qpair->local.dpi_enabled = dpi_required; + # ifdef VERBOSE std::cout << "bThread: RDMA is enabled, created the local QP with QPN " << qpair->local.qpn << ", local PSN " << qpair->local.psn << ", and local rkey " << qpair->local.rkey << "." << std::endl; # endif diff --git a/sw/src/cService.cpp b/sw/src/cService.cpp index fd7671d7..3b0b6f44 100644 --- a/sw/src/cService.cpp +++ b/sw/src/cService.cpp @@ -309,6 +309,28 @@ void cService::acceptConnectionRemote() { # endif cthread->getQpair()->remote = r_qp; // store the received remote QP + + // Negotiate the Balboa-capabilities by comparing local and remote queue + + // AES-encryption: The larger aes-key becomes the common one. 
If both AES-keys are set to 0, no encryption is used for this QP + if(cthread->getQpair()->local.aes_key > cthread->getQpair()->remote.aes_key) { + cthread->getQpair()->remote.aes_key = cthread->getQpair()->local.aes_key; + } else { + cthread->getQpair()->local.aes_key = cthread->getQpair()->remote.aes_key; + } + + // Compression agreement: If at least one party wants compression, it is used for this communication flow + if(cthread->getQpair()->local.compression_enabled || cthread->getQpair()->remote.compression_enabled) { + cthread->getQpair()->remote.compression_enabled = true; + cthread->getQpair()->local.compression_enabled = true; + } + + // DPI agreement: If at least one party wants to use DPI, it is used for this communication flow + if(cthread->getQpair()->local.dpi_enabled || cthread->getQpair()->remote.dpi_enabled) { + cthread->getQpair()->remote.dpi_enabled = true; + cthread->getQpair()->local.dpi_enabled = true; + } + cthread->getMem({CoyoteAlloc::HPF, r_qp.size, true}); // Allocate memory for receiving data for RDMA # ifdef VERBOSE