From c8b04ab731c5e84c03005d03119a67e2dace7cb4 Mon Sep 17 00:00:00 2001 From: Stefan Berger Date: Tue, 3 Oct 2023 15:50:58 -0400 Subject: [PATCH] [LibOS] Delay IPC leader notification until it has 0 connections To avoid killing the IPC leader before all the child processes have died, delay notifying the leader that it should terminate until it has 0 connections. Signed-off-by: Stefan Berger --- libos/src/ipc/libos_ipc_worker.c | 43 ++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/libos/src/ipc/libos_ipc_worker.c b/libos/src/ipc/libos_ipc_worker.c index 667c889abd..43d648b83b 100644 --- a/libos/src/ipc/libos_ipc_worker.c +++ b/libos/src/ipc/libos_ipc_worker.c @@ -65,6 +65,8 @@ static ipc_callback ipc_callbacks[] = { [IPC_MSG_FILE_LOCK_CLEAR_PID] = ipc_file_lock_clear_pid_callback, }; +static PAL_HANDLE leader_notifier; + static void ipc_leader_died_callback(void) { /* This might happen legitimately e.g. if IPC leader is also our parent and does `wait` + `exit` * If this is an erroneous disconnect it will be noticed when trying to communicate with @@ -106,13 +108,17 @@ static int add_ipc_connection(PAL_HANDLE handle, IDTYPE id) { return 0; } -static void del_ipc_connection(struct libos_ipc_connection* conn) { +static void del_ipc_connection(struct libos_ipc_connection* conn, + PAL_HANDLE *notifier) { LISTP_DEL(conn, &g_ipc_connections, list); g_ipc_connections_cnt--; PalObjectDestroy(conn->handle); free(conn); + + if (notifier && g_ipc_connections_cnt == 0) + PalEventSet(*notifier); } /* @@ -213,7 +219,7 @@ static int receive_ipc_messages(struct libos_ipc_connection* conn) { return 0; } -static noreturn void ipc_worker_main(void) { +static noreturn void ipc_worker_main(PAL_HANDLE *notifier) { /* TODO: If we had a global array of connections (instead of a list) we wouldn't have to gather * them all here in every loop iteration, but then deletion would be slower (but deletion should * be rare). 
*/ @@ -337,7 +343,7 @@ static noreturn void ipc_worker_main(void) { if (ret == 1) { /* Connection closed. */ disconnect_callbacks(conn); - del_ipc_connection(conn); + del_ipc_connection(conn, notifier); continue; } if (ret < 0) { @@ -351,12 +357,18 @@ static noreturn void ipc_worker_main(void) { * more time - in case there are messages left to be read. */ if (ret_events[i] == PAL_WAIT_ERROR) { disconnect_callbacks(conn); - del_ipc_connection(conn); + del_ipc_connection(conn, notifier); } } } out_die: + + if (notifier) { + g_ipc_connections_cnt = 0; + PalEventSet(*notifier); + } + PalProcessExit(1); } @@ -370,7 +382,17 @@ static int ipc_worker_wrapper(void* arg) { log_setprefix(libos_get_tcb()); log_debug("IPC worker started"); - ipc_worker_main(); + + PAL_HANDLE *notifier = NULL; + if (g_process_ipc_ids.self_vmid == STARTING_VMID) { + notifier = &leader_notifier; + /* IPC leader gets a notifier used in terminate_ipc_worker */ + if (PalEventCreate(notifier, false, false) < 0) { + log_error("PalEventCreate failed"); + return -1; + } + } + ipc_worker_main(notifier); /* Unreachable. */ } @@ -409,12 +431,23 @@ int init_ipc_worker(void) { } void terminate_ipc_worker(void) { + if (g_process_ipc_ids.self_vmid == STARTING_VMID) { + uint64_t timeout_us = 100*1000; + + PalEventClear(leader_notifier); + while (__atomic_load_n(&g_ipc_connections_cnt, __ATOMIC_ACQUIRE) > 0) { + PalEventWait(leader_notifier, &timeout_us); + if (!__atomic_load_n(&g_clear_on_worker_exit, __ATOMIC_ACQUIRE)) + goto end; + } + } set_pollable_event(&g_worker_thread->pollable_event); while (__atomic_load_n(&g_clear_on_worker_exit, __ATOMIC_ACQUIRE)) { CPU_RELAX(); } +end: put_thread(g_worker_thread); g_worker_thread = NULL; PalObjectDestroy(g_self_ipc_handle);