From 617d0aafa1db3900b44be7c1fe2520f7661eb5ad Mon Sep 17 00:00:00 2001 From: Kailun Qin Date: Fri, 26 Jan 2024 02:55:23 -0500 Subject: [PATCH] [WIP] [LibOS] Add support for timerfd system calls This commit adds support for system calls that create and operate on a timer that delivers timer expiration notifications via a file descriptor, specifically: `timerfd_create()`, `timerfd_settime()` and `timerfd_gettime()`. The timerfd object is associated with a dummy eventfd created on the host to trigger notifications (e.g., in epoll). The object is created inside Gramine, with all its operations resolved entirely inside Gramine. The emulation is currently implemented at the level of a single process. However, it may sometimes work for multi-process applications, e.g., if the child process inherits the timerfd object but doesn't use it; to support these cases, we introduce the `sys.experimental__allow_timerfd_fork` manifest option. LibOS regression tests are also added. Signed-off-by: Kailun Qin --- Documentation/devel/features.md | 28 +- Documentation/manifest-syntax.rst | 16 + libos/include/libos_fs.h | 5 + libos/include/libos_handle.h | 12 + libos/include/libos_internal.h | 4 + libos/include/libos_table.h | 4 + libos/include/libos_utils.h | 2 +- libos/include/linux_abi/timerfd.h | 17 + libos/src/arch/x86_64/libos_table.c | 6 +- libos/src/fs/libos_fs.c | 1 + libos/src/fs/proc/thread.c | 1 + libos/src/fs/timerfd/fs.c | 124 +++++++ libos/src/libos_async.c | 46 ++- libos/src/libos_init.c | 2 + libos/src/libos_parser.c | 10 +- libos/src/meson.build | 2 + libos/src/sys/libos_alarm.c | 6 +- libos/src/sys/libos_clone.c | 17 + libos/src/sys/libos_epoll.c | 1 + libos/src/sys/libos_exit.c | 3 +- libos/src/sys/libos_ioctl.c | 3 +- libos/src/sys/libos_poll.c | 5 + libos/src/sys/libos_timerfd.c | 270 ++++++++++++++++ libos/test/ltp/ltp.cfg | 4 +- libos/test/ltp/ltp_bug_1075.cfg | 3 + libos/test/regression/meson.build | 2 + libos/test/regression/test_libos.py | 20 ++ 
libos/test/regression/tests.toml | 3 + libos/test/regression/tests_musl.toml | 4 + libos/test/regression/timerfd.c | 302 ++++++++++++++++++ libos/test/regression/timerfd_fork.c | 66 ++++ ...rfd_fork_allowed_failing.manifest.template | 27 ++ .../timerfd_fork_disallowed.manifest.template | 23 ++ 33 files changed, 993 insertions(+), 46 deletions(-) create mode 100644 libos/include/linux_abi/timerfd.h create mode 100644 libos/src/fs/timerfd/fs.c create mode 100644 libos/src/sys/libos_timerfd.c create mode 100644 libos/test/regression/timerfd.c create mode 100644 libos/test/regression/timerfd_fork.c create mode 100644 libos/test/regression/timerfd_fork_allowed_failing.manifest.template create mode 100644 libos/test/regression/timerfd_fork_disallowed.manifest.template diff --git a/Documentation/devel/features.md b/Documentation/devel/features.md index ab6dbb7717..a75f7bc900 100644 --- a/Documentation/devel/features.md +++ b/Documentation/devel/features.md @@ -1036,7 +1036,7 @@ The below list is generated from the [syscall table of Linux - ☒ `signalfd()` [7](#signals-and-process-state-changes) -- ☒ `timerfd_create()` +- ▣ `timerfd_create()` [20](#sleeps-timers-and-alarms) - ▣ `eventfd()` @@ -1045,10 +1045,10 @@ The below list is generated from the [syscall table of Linux - ▣ `fallocate()` [9a](#file-system-operations) -- ☒ `timerfd_settime()` +- ▣ `timerfd_settime()` [20](#sleeps-timers-and-alarms) -- ☒ `timerfd_gettime()` +- ▣ `timerfd_gettime()` [20](#sleeps-timers-and-alarms) - ☑ `accept4()` @@ -2862,9 +2862,21 @@ Gramine implements getting and setting the interval timer: `getitimer()` and `se Gramine implements alarm clocks via `alarm()`. +Gramine implements timers that notify via file descriptors: `timerfd_create()`, `timerfd_settime()` +and `timerfd_gettime()`. The timerfd object is created inside Gramine, and all operations are +resolved entirely inside Gramine. Each timerfd object is associated with a dummy eventfd created on +the host. 
This is purely for triggering read/write notifications (e.g., in epoll); timerfd data is +verified inside Gramine and is never exposed to the host. Since the host is used purely for +notifications, a malicious host can only induce Denial of Service (DoS) attacks. + +The emulation is currently implemented at the level of a single process. The emulation may work for +multi-process applications, e.g., if the child process inherits the timerfd object but doesn't use +it. However, multi-process support is brittle and thus disabled by default (Gramine will issue a +warning). To enable it anyway, set the [`sys.experimental__allow_timerfd_fork` manifest +option](../manifest-syntax.html#allowing-timerfd-in-multi-process-applications). + Gramine does *not* currently implement the POSIX per-process timer: `timer_create()`, etc. Gramine -also does not currently implement timers that notify via file descriptors. Gramine could implement -these timers in the future, if need arises. +could implement it in the future, if need arises.
Related system calls @@ -2880,9 +2892,9 @@ these timers in the future, if need arises. - ☒ `timer_getoverrun()`: may be implemented in the future - ☒ `timer_delete()`: may be implemented in the future -- ☒ `timerfd_create()`: may be implemented in the future -- ☒ `timerfd_settime()`: may be implemented in the future -- ☒ `timerfd_gettime()`: may be implemented in the future +- ▣ `timerfd_create()`: see notes above +- ▣ `timerfd_settime()`: see notes above +- ▣ `timerfd_gettime()`: see notes above

diff --git a/Documentation/manifest-syntax.rst b/Documentation/manifest-syntax.rst index 557f6e696b..bf1298d034 100644 --- a/Documentation/manifest-syntax.rst +++ b/Documentation/manifest-syntax.rst @@ -364,6 +364,22 @@ Python). Could be useful in SGX environments: child processes consume to achieve this, you need to run the whole Gramine inside a proper security sandbox. +.. _timerfd-in-multi-process: + +Allowing timerfd in multi-process applications +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:: + + sys.experimental__allow_timerfd_fork = [true|false] + (Default: false) + +Gramine implements timerfd in a secure way, but this implementation works only +in single-process applications. If you have a multi-process application and you +are sure that the parent process and its child processes do not have +cross-process usage of timerfd, you can use +``sys.experimental__allow_timerfd_fork`` manifest syntax. + Root FS mount point ^^^^^^^^^^^^^^^^^^^ diff --git a/libos/include/libos_fs.h b/libos/include/libos_fs.h index f87c6f6595..1636eb121d 100644 --- a/libos/include/libos_fs.h +++ b/libos/include/libos_fs.h @@ -182,6 +182,10 @@ struct libos_fs_ops { /* Poll a single handle. Must not block. */ int (*poll)(struct libos_handle* hdl, int in_events, int* out_events); + /* Verify a single handle after poll. Must update `pal_ret_events` in-place with only allowed + * ones. Used in e.g. secure timerfd FS. 
*/ + void (*post_poll)(struct libos_handle* hdl, pal_wait_flags_t* pal_ret_events); + /* checkpoint/migrate the file system */ ssize_t (*checkpoint)(void** checkpoint, void* mount_data); int (*migrate)(void* checkpoint, void** mount_data); @@ -930,6 +934,7 @@ extern struct libos_fs eventfd_builtin_fs; extern struct libos_fs synthetic_builtin_fs; extern struct libos_fs path_builtin_fs; extern struct libos_fs shm_builtin_fs; +extern struct libos_fs timerfd_builtin_fs; struct libos_fs* find_fs(const char* name); diff --git a/libos/include/libos_handle.h b/libos/include/libos_handle.h index 6cf9658a87..6fb0a7416f 100644 --- a/libos/include/libos_handle.h +++ b/libos/include/libos_handle.h @@ -46,6 +46,7 @@ enum libos_handle_type { /* Special handles: */ TYPE_EPOLL, /* epoll handles, see `libos_epoll.c` */ TYPE_EVENTFD, /* eventfd handles, used by `eventfd` filesystem */ + TYPE_TIMERFD, /* timerfd handles, used by `timerfd` filesystem */ }; struct libos_pipe_handle { @@ -134,6 +135,16 @@ struct libos_epoll_handle { size_t last_returned_index; }; +struct libos_timerfd_handle { + spinlock_t expiration_lock; /* protecting below fields */ + uint64_t num_expirations; + uint64_t dummy_host_val; + + spinlock_t timer_lock; + uint64_t timeout; + uint64_t reset; +}; + struct libos_handle { enum libos_handle_type type; bool is_dir; @@ -204,6 +215,7 @@ struct libos_handle { struct libos_epoll_handle epoll; /* TYPE_EPOLL */ struct { bool is_semaphore; } eventfd; /* TYPE_EVENTFD */ + struct libos_timerfd_handle timerfd; /* TYPE_TIMERFD */ } info; struct libos_dir_handle dir_info; diff --git a/libos/include/libos_internal.h b/libos/include/libos_internal.h index 9f713d56ae..a021fc4ad5 100644 --- a/libos/include/libos_internal.h +++ b/libos/include/libos_internal.h @@ -262,3 +262,7 @@ int init_stack(const char* const* argv, const char* const* envp, char*** out_arg * The implementation of this function depends on the used architecture. 
*/ noreturn void call_elf_entry(elf_addr_t entry, void* argp); + +extern bool g_timerfd_allow_fork; +extern uint32_t g_timerfd_cnt; +int init_timerfd(void); diff --git a/libos/include/libos_table.h b/libos/include/libos_table.h index aa6aaf25c7..76cb882eae 100644 --- a/libos/include/libos_table.h +++ b/libos/include/libos_table.h @@ -206,3 +206,7 @@ long libos_syscall_getcpu(unsigned* cpu, unsigned* node, void* unused_cache); long libos_syscall_getrandom(char* buf, size_t count, unsigned int flags); long libos_syscall_mlock2(unsigned long start, size_t len, int flags); long libos_syscall_sysinfo(struct sysinfo* info); +long libos_syscall_timerfd_create(int clockid, int flags); +long libos_syscall_timerfd_settime(int fd, int flags, const struct __kernel_itimerspec* value, + struct __kernel_itimerspec* ovalue); +long libos_syscall_timerfd_gettime(int fd, struct __kernel_itimerspec* value); diff --git a/libos/include/libos_utils.h b/libos/include/libos_utils.h index e3f4cb2e1f..f67b335775 100644 --- a/libos/include/libos_utils.h +++ b/libos/include/libos_utils.h @@ -53,7 +53,7 @@ int create_pipe(char* name, char* uri, size_t size, PAL_HANDLE* hdl, bool use_vm /* Asynchronous event support */ int init_async_worker(void); -int64_t install_async_event(PAL_HANDLE object, unsigned long time, +int64_t install_async_event(PAL_HANDLE object, unsigned long time, bool absolute_time, void (*callback)(IDTYPE caller, void* arg), void* arg); struct libos_thread* terminate_async_worker(void); diff --git a/libos/include/linux_abi/timerfd.h b/libos/include/linux_abi/timerfd.h new file mode 100644 index 0000000000..e1dfde46ea --- /dev/null +++ b/libos/include/linux_abi/timerfd.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-3.0-or-later */ +/* Copyright (C) 2024 Intel Corporation + * Kailun Qin + */ + +#pragma once + +/* Types and structures used by various Linux ABIs (e.g. syscalls). */ +/* These need to be binary-identical with the ones used by Linux. 
*/ + +#include + +#define TFD_SHARED_FCNTL_FLAGS (TFD_CLOEXEC | TFD_NONBLOCK) +/* Flags for timerfd_create. */ +#define TFD_CREATE_FLAGS TFD_SHARED_FCNTL_FLAGS +/* Flags for timerfd_settime. */ +#define TFD_SETTIME_FLAGS (TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET) diff --git a/libos/src/arch/x86_64/libos_table.c b/libos/src/arch/x86_64/libos_table.c index 86147ec29e..3924825e63 100644 --- a/libos/src/arch/x86_64/libos_table.c +++ b/libos/src/arch/x86_64/libos_table.c @@ -297,11 +297,11 @@ libos_syscall_t libos_syscall_table[LIBOS_SYSCALL_BOUND] = { [__NR_utimensat] = (libos_syscall_t)0, // libos_syscall_utimensat [__NR_epoll_pwait] = (libos_syscall_t)libos_syscall_epoll_pwait, [__NR_signalfd] = (libos_syscall_t)0, // libos_syscall_signalfd - [__NR_timerfd_create] = (libos_syscall_t)0, // libos_syscall_timerfd_create + [__NR_timerfd_create] = (libos_syscall_t)libos_syscall_timerfd_create, [__NR_eventfd] = (libos_syscall_t)libos_syscall_eventfd, [__NR_fallocate] = (libos_syscall_t)libos_syscall_fallocate, - [__NR_timerfd_settime] = (libos_syscall_t)0, // libos_syscall_timerfd_settime - [__NR_timerfd_gettime] = (libos_syscall_t)0, // libos_syscall_timerfd_gettime + [__NR_timerfd_settime] = (libos_syscall_t)libos_syscall_timerfd_settime, + [__NR_timerfd_gettime] = (libos_syscall_t)libos_syscall_timerfd_gettime, [__NR_accept4] = (libos_syscall_t)libos_syscall_accept4, [__NR_signalfd4] = (libos_syscall_t)0, // libos_syscall_signalfd4 [__NR_eventfd2] = (libos_syscall_t)libos_syscall_eventfd2, diff --git a/libos/src/fs/libos_fs.c b/libos/src/fs/libos_fs.c index 5a29a36d6d..f10aefd74b 100644 --- a/libos/src/fs/libos_fs.c +++ b/libos/src/fs/libos_fs.c @@ -33,6 +33,7 @@ static struct libos_fs* g_builtin_fs[] = { &synthetic_builtin_fs, &path_builtin_fs, &shm_builtin_fs, + &timerfd_builtin_fs, }; static struct libos_lock g_mount_mgr_lock; diff --git a/libos/src/fs/proc/thread.c b/libos/src/fs/proc/thread.c index c3da147c48..ed1fa1a95a 100644 --- a/libos/src/fs/proc/thread.c 
+++ b/libos/src/fs/proc/thread.c @@ -287,6 +287,7 @@ static char* describe_handle(struct libos_handle* hdl) { case TYPE_EPOLL: str = "epoll:[?]"; break; case TYPE_EVENTFD: str = "eventfd:[?]"; break; case TYPE_SHM: str = "shm:[?]"; break; + case TYPE_TIMERFD: str = "timerfd:[?]"; break; default: str = "unknown:[?]"; break; } return strdup(str); diff --git a/libos/src/fs/timerfd/fs.c b/libos/src/fs/timerfd/fs.c new file mode 100644 index 0000000000..33673dc72d --- /dev/null +++ b/libos/src/fs/timerfd/fs.c @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: LGPL-3.0-or-later */ +/* Copyright (C) 2024 Intel Corporation + * Kailun Qin + */ + +/* + * This file contains code for implementation of 'timerfd' filesystem. + */ + +#include "libos_fs.h" +#include "libos_handle.h" +#include "libos_internal.h" +#include "libos_lock.h" +#include "linux_abi/errors.h" +#include "pal.h" + +static void timerfd_dummy_host_read(struct libos_handle* hdl, uint64_t* out_host_val) { + uint64_t buf_dummy_host_val = 0; + size_t dummy_host_val_count = sizeof(buf_dummy_host_val); + + int ret = PalStreamRead(hdl->pal_handle, /*offset=*/0, &dummy_host_val_count, + &buf_dummy_host_val); + if (ret < 0 || dummy_host_val_count != sizeof(buf_dummy_host_val)) { + /* should not happen in benign case, but can happen under racing, e.g. 
threads may race on + * the same eventfd event, one of them wins and updates `dummy_host_val` and the other one + * looses and gets an unexpected `dummy_host_val` */ + log_warning("timerfd dummy host read failed or got unexpected value"); + return; + } + + if (out_host_val) + *out_host_val = buf_dummy_host_val; +} + +static ssize_t timerfd_read(struct libos_handle* hdl, void* buf, size_t count, file_off_t* pos) { + __UNUSED(pos); + assert(hdl->type == TYPE_TIMERFD); + + if (count < sizeof(uint64_t)) + return -EINVAL; + + int ret; + spinlock_lock(&hdl->info.timerfd.expiration_lock); + + while (!hdl->info.timerfd.num_expirations) { + if (hdl->flags & O_NONBLOCK) { + ret = -EAGAIN; + goto out; + } + /* must block -- use the host's blocking read() on a dummy eventfd */ + if (hdl->info.timerfd.dummy_host_val) { + /* value on host is non-zero, must perform a read to make it zero (and thus the next + * read will become blocking) */ + uint64_t host_val = 0; + timerfd_dummy_host_read(hdl, &host_val); + if (host_val != hdl->info.timerfd.dummy_host_val) + BUG(); + hdl->info.timerfd.dummy_host_val = 0; + } + + spinlock_unlock(&hdl->info.timerfd.expiration_lock); + /* blocking read to wait for some value (we don't care which value) */ + timerfd_dummy_host_read(hdl, /*out_host_val=*/NULL); + spinlock_lock(&hdl->info.timerfd.expiration_lock); + hdl->info.timerfd.dummy_host_val = 0; + } + + memcpy(buf, &hdl->info.timerfd.num_expirations, sizeof(uint64_t)); + hdl->info.timerfd.num_expirations = 0; + + /* perform a read (not supposed to block) to clear the event from writing/polling threads */ + if (hdl->info.timerfd.dummy_host_val) { + timerfd_dummy_host_read(hdl, /*out_host_val=*/NULL); + hdl->info.timerfd.dummy_host_val = 0; + } + + ret = (ssize_t)count; +out: + spinlock_unlock(&hdl->info.timerfd.expiration_lock); + maybe_epoll_et_trigger(hdl, ret, /*in=*/true, /*unused was_partial=*/false); + return ret; +} + +static void timerfd_post_poll(struct libos_handle* hdl, 
pal_wait_flags_t* pal_ret_events) { + assert(hdl->type == TYPE_TIMERFD); + + if (*pal_ret_events & (PAL_WAIT_ERROR | PAL_WAIT_HANG_UP)) { + /* impossible: we control eventfd inside the LibOS, and we never raise such conditions */ + BUG(); + } + + spinlock_lock(&hdl->info.timerfd.expiration_lock); + if (*pal_ret_events & PAL_WAIT_READ) { + /* there is data to read: verify if counter has value greater than zero */ + if (!hdl->info.timerfd.num_expirations) { + /* spurious or malicious notification -- for now we don't BUG but ignore it */ + *pal_ret_events &= ~PAL_WAIT_READ; + } + } + if (*pal_ret_events & PAL_WAIT_WRITE) { + /* spurious or malicious notification */ + BUG(); + } + spinlock_unlock(&hdl->info.timerfd.expiration_lock); +} + +static int timerfd_close(struct libos_handle* hdl) { + __UNUSED(hdl); + + /* see `libos_timerfd.c` for the handle-open counterpart */ + (void)__atomic_sub_fetch(&g_timerfd_cnt, 1, __ATOMIC_ACQ_REL); + return 0; +} + +struct libos_fs_ops timerfd_fs_ops = { + .read = &timerfd_read, + .close = &timerfd_close, + .post_poll = &timerfd_post_poll, +}; + +struct libos_fs timerfd_builtin_fs = { + .name = "timerfd", + .fs_ops = &timerfd_fs_ops, +}; diff --git a/libos/src/libos_async.c b/libos/src/libos_async.c index 81879393f9..fbebd4b810 100644 --- a/libos/src/libos_async.c +++ b/libos/src/libos_async.c @@ -23,8 +23,9 @@ struct async_event { LIST_TYPE(async_event) triggered_list; void (*callback)(IDTYPE caller, void* arg); void* arg; - PAL_HANDLE object; /* handle (async IO) to wait on */ - uint64_t expire_time; /* alarm/timer to wait on */ + PAL_HANDLE object; /* handle (async IO) to wait on */ + PAL_HANDLE timer_object; /* handle to identify timer object; currently used for timerfd */ + uint64_t expire_time; /* alarm/timer to wait on */ }; DEFINE_LISTP(async_event); static LISTP_TYPE(async_event) async_list; @@ -40,26 +41,22 @@ static struct libos_pollable_event install_new_event; static int create_async_worker(void); -/* Threads register 
async events like alarm(), setitimer(), ioctl(FIOASYNC) - * using this function. These events are enqueued in async_list and delivered - * to async worker thread by triggering install_new_event. When event is - * triggered in async worker thread, the corresponding event's callback with - * arguments `arg` is called. This callback typically sends a signal to the +/* Threads register async events like alarm(), setitimer(), timerfd_settime(), ioctl(FIOASYNC) using + * this function. These events are enqueued in async_list and delivered to async worker thread by + * triggering install_new_event. When event is triggered in async worker thread, the corresponding + * event's callback with arguments `arg` is called. This callback typically sends a signal to the * thread which registered the event (saved in `event->caller`). * * We distinguish between alarm/timer events and async IO events: - * - alarm/timer events set object = NULL and time = seconds - * (time = 0 cancels all pending alarms/timers). + * - alarm/timer events set time = seconds (time = 0 cancels all pending alarms/timers). + * Specifically, when object != NULL and time != 0, this indicates a timerfd event. * - async IO events set object = handle and time = 0. * - * Function returns remaining usecs for alarm/timer events (same as alarm()) - * or 0 for async IO events. On error, it returns a negated error code. + * Function returns remaining usecs for alarm/timer events (same as alarm()) or 0 for async IO + * events. On error, it returns a negated error code.
*/ -int64_t install_async_event(PAL_HANDLE object, uint64_t time, +int64_t install_async_event(PAL_HANDLE object, uint64_t time, bool absolute_time, void (*callback)(IDTYPE caller, void* arg), void* arg) { - /* if event happens on object, time must be zero */ - assert(!object || (object && !time)); - uint64_t now = 0; int ret = PalSystemTimeQuery(&now); if (ret < 0) { @@ -73,21 +70,22 @@ int64_t install_async_event(PAL_HANDLE object, uint64_t time, return -ENOMEM; } - event->callback = callback; - event->arg = arg; - event->caller = get_cur_tid(); - event->object = object; - event->expire_time = time ? now + time : 0; + event->callback = callback; + event->arg = arg; + event->caller = get_cur_tid(); + event->object = time ? NULL : object; + event->timer_object = (object && time) ? object : NULL; + event->expire_time = time ? (absolute_time ? time : now + time) : 0; lock(&async_worker_lock); - if (callback != &cleanup_thread && !object) { - /* This is alarm() or setitimer() emulation, treat both according to - * alarm() syscall semantics: cancel any pending alarm/timer. */ + if (callback != &cleanup_thread && (!object || event->timer_object)) { + /* This is alarm(), setitimer(), timerfd_settime() emulation, treat all according to alarm() + * syscall semantics: cancel any pending alarm/timer. 
*/ struct async_event* tmp; struct async_event* n; LISTP_FOR_EACH_ENTRY_SAFE(tmp, n, &async_list, list) { - if (tmp->expire_time) { + if (tmp->timer_object == object && tmp->expire_time) { /* this is a pending alarm/timer, cancel it and save its expiration time */ if (max_prev_expire_time < tmp->expire_time) max_prev_expire_time = tmp->expire_time; diff --git a/libos/src/libos_init.c b/libos/src/libos_init.c index 332c9f4694..29a5d5194d 100644 --- a/libos/src/libos_init.c +++ b/libos/src/libos_init.c @@ -502,6 +502,8 @@ noreturn void libos_init(const char* const* argv, const char* const* envp) { RUN_INIT(set_hostname, g_pal_public_state->dns_host.hostname, strlen(g_pal_public_state->dns_host.hostname)); + RUN_INIT(init_timerfd); + log_debug("LibOS initialized"); libos_tcb_t* cur_tcb = libos_get_tcb(); diff --git a/libos/src/libos_parser.c b/libos/src/libos_parser.c index e9de40c9cd..dbc4fbbbf9 100644 --- a/libos/src/libos_parser.c +++ b/libos/src/libos_parser.c @@ -515,13 +515,17 @@ struct parser_table { parse_integer_arg, parse_pointer_arg, parse_integer_arg, parse_integer_arg, parse_pointer_arg, parse_pointer_arg}}, [__NR_signalfd] = {.slow = false, .name = "signalfd", .parser = {NULL}}, - [__NR_timerfd_create] = {.slow = false, .name = "timerfd_create", .parser = {NULL}}, + [__NR_timerfd_create] = {.slow = false, .name = "timerfd_create", .parser = {parse_long_arg, + parse_integer_arg, parse_integer_arg}}, [__NR_eventfd] = {.slow = false, .name = "eventfd", .parser = {parse_long_arg, parse_integer_arg}}, [__NR_fallocate] = {.slow = false, .name = "fallocate", .parser = {parse_long_arg, parse_integer_arg, parse_integer_arg, parse_long_arg, parse_long_arg}}, - [__NR_timerfd_settime] = {.slow = false, .name = "timerfd_settime", .parser = {NULL}}, - [__NR_timerfd_gettime] = {.slow = false, .name = "timerfd_gettime", .parser = {NULL}}, + [__NR_timerfd_settime] = {.slow = false, .name = "timerfd_settime", .parser = {parse_long_arg, + parse_integer_arg, 
parse_integer_arg, parse_pointer_arg, + parse_pointer_arg}}, + [__NR_timerfd_gettime] = {.slow = false, .name = "timerfd_gettime", .parser = {parse_long_arg, + parse_integer_arg, parse_pointer_arg}}, [__NR_accept4] = {.slow = true, .name = "accept4", .parser = {parse_long_arg, parse_integer_arg, parse_pointer_arg, parse_pointer_arg, parse_integer_arg}}, [__NR_signalfd4] = {.slow = false, .name = "signalfd4", .parser = {NULL}}, diff --git a/libos/src/meson.build b/libos/src/meson.build index b9946bc2af..3bd8a72f28 100644 --- a/libos/src/meson.build +++ b/libos/src/meson.build @@ -44,6 +44,7 @@ libos_sources = files( 'fs/sys/fs.c', 'fs/sys/node_info.c', 'fs/tmpfs/fs.c', + 'fs/timerfd/fs.c', 'gramine_hash.c', 'ipc/libos_ipc.c', 'ipc/libos_ipc_child.c', @@ -101,6 +102,7 @@ libos_sources = files( 'sys/libos_socket.c', 'sys/libos_stat.c', 'sys/libos_time.c', + 'sys/libos_timerfd.c', 'sys/libos_uname.c', 'sys/libos_wait.c', 'sys/libos_wrappers.c', diff --git a/libos/src/sys/libos_alarm.c b/libos/src/sys/libos_alarm.c index f4bc5fbe21..28d5e0836d 100644 --- a/libos/src/sys/libos_alarm.c +++ b/libos/src/sys/libos_alarm.c @@ -35,7 +35,7 @@ static void signal_alarm(IDTYPE caller, void* arg) { long libos_syscall_alarm(unsigned int seconds) { uint64_t usecs = 1000000ULL * seconds; - int64_t ret = install_async_event(NULL, usecs, &signal_alarm, NULL); + int64_t ret = install_async_event(NULL, usecs, /*absolute_time=*/false, &signal_alarm, NULL); if (ret < 0) return ret; @@ -105,8 +105,8 @@ long libos_syscall_setitimer(int which, struct __kernel_itimerval* value, : 0; uint64_t current_reset = g_real_itimer.reset; - int64_t install_ret = install_async_event(NULL, next_value, &signal_itimer, - (void*)(setup_time + next_value)); + int64_t install_ret = install_async_event(NULL, next_value, /*absolute_time=*/false, + &signal_itimer, (void*)(setup_time + next_value)); if (install_ret < 0) { spinlock_unlock(&g_real_itimer_lock); diff --git a/libos/src/sys/libos_clone.c 
b/libos/src/sys/libos_clone.c index f8636a0428..6ae88a2945 100644 --- a/libos/src/sys/libos_clone.c +++ b/libos/src/sys/libos_clone.c @@ -329,6 +329,23 @@ long libos_syscall_clone(unsigned long flags, unsigned long user_stack_addr, int } return -EAGAIN; } + + if (!g_timerfd_allow_fork && __atomic_load_n(&g_timerfd_cnt, __ATOMIC_ACQUIRE)) { + /* + * Emulated (secure) mode of timerfd is currently only single-process. If the manifest + * doesn't specify explicitly that the fork is allowed for a process that created some + * timerfds, disallow this clone/fork. In some cases, however, the child process doesn't + * use timerfds of parent, and then clone/fork is not a problem; this is brittle so it + * is disabled by default (and enabled via `sys.experimental__allow_timerfd_fork`). + */ + if (FIRST_TIME()) { + log_warning("The app tried to create a subprocess, but this is disallowed because " + "timerfd is emulated in a secure single-process mode, there is at " + "least one timerfd object existing and the manifest option " + "'sys.experimental__allow_timerfd_fork' is not set"); + } + return -EAGAIN; + } } struct libos_thread* thread = get_new_thread(); diff --git a/libos/src/sys/libos_epoll.c b/libos/src/sys/libos_epoll.c index fab6965eee..f3973f7c8c 100644 --- a/libos/src/sys/libos_epoll.c +++ b/libos/src/sys/libos_epoll.c @@ -461,6 +461,7 @@ long libos_syscall_epoll_ctl(int epfd, int op, int fd, struct epoll_event* event case TYPE_PIPE: case TYPE_SOCK: case TYPE_EVENTFD: + case TYPE_TIMERFD: break; default: /* epoll not supported by this type of handle */ diff --git a/libos/src/sys/libos_exit.c b/libos/src/sys/libos_exit.c index df21ec1168..79554e23fe 100644 --- a/libos/src/sys/libos_exit.c +++ b/libos/src/sys/libos_exit.c @@ -108,7 +108,8 @@ noreturn void thread_exit(int error_code, int term_signal) { cur_thread->clear_child_tid_pal = 1; /* any non-zero value suffices */ /* We pass this ownership to `cleanup_thread`. 
*/ get_thread(cur_thread); - int64_t ret = install_async_event(NULL, 0, &cleanup_thread, cur_thread); + int64_t ret = install_async_event(NULL, 0, /*absolute_time=*/false, &cleanup_thread, + cur_thread); /* Take the reference to the current thread from the tcb. */ lock(&cur_thread->lock); diff --git a/libos/src/sys/libos_ioctl.c b/libos/src/sys/libos_ioctl.c index 89d5424da9..6d7413bd99 100644 --- a/libos/src/sys/libos_ioctl.c +++ b/libos/src/sys/libos_ioctl.c @@ -104,7 +104,8 @@ long libos_syscall_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) { rwlock_write_unlock(&handle_map->lock); break; case FIOASYNC: - ret = install_async_event(hdl->pal_handle, 0, &signal_io, NULL); + ret = install_async_event(hdl->pal_handle, 0, /*absolute_time=*/false, &signal_io, + NULL); break; case FIONREAD: { if (!is_user_memory_writable((void*)arg, sizeof(int))) { diff --git a/libos/src/sys/libos_poll.c b/libos/src/sys/libos_poll.c index 734b7a7242..714177e742 100644 --- a/libos/src/sys/libos_poll.c +++ b/libos/src/sys/libos_poll.c @@ -200,6 +200,11 @@ static long do_poll(struct pollfd* fds, size_t fds_len, uint64_t* timeout_us) { continue; } + if (libos_handles[i]->fs && libos_handles[i]->fs->fs_ops + && libos_handles[i]->fs->fs_ops->post_poll) { + libos_handles[i]->fs->fs_ops->post_poll(libos_handles[i], &ret_events[i]); + } + fds[i].revents = 0; if (ret_events[i] & PAL_WAIT_ERROR) fds[i].revents |= POLLERR; diff --git a/libos/src/sys/libos_timerfd.c b/libos/src/sys/libos_timerfd.c new file mode 100644 index 0000000000..0d25b7be65 --- /dev/null +++ b/libos/src/sys/libos_timerfd.c @@ -0,0 +1,270 @@ +/* SPDX-License-Identifier: LGPL-3.0-or-later */ +/* Copyright (C) 2024 Intel Corporation + * Kailun Qin + */ + +/* Implementation of "timerfd" system calls. + * + * The timerfd object is created inside Gramine, and all operations are resolved entirely inside + * Gramine. Each timerfd object is associated with a dummy eventfd created on the host. 
This is + * purely for triggering read/write notifications (e.g., in epoll); timerfd data is verified inside + * Gramine and is never exposed to the host. Since the host is used purely for notifications, a + * malicious host can only induce Denial of Service (DoS) attacks. The dummy eventfd object is + * hardened following the similar approaches as Gramine's `eventfd`/`eventfd2` syscall + * implementation, see "libos/src/sys/libos_eventfd.c" for details. + * + * The emulation is currently implemented at the level of a single process. The emulation may work + * for multi-process applications, e.g., if the child process inherits the timerfd object but + * doesn't use it. However, multi-process support is brittle and thus disabled by default (Gramine + * will issue a warning). To enable it still, set the manifest option + * `sys.experimental__allow_timerfd_fork`. + */ + +#include "libos_checkpoint.h" +#include "libos_fs.h" +#include "libos_handle.h" +#include "libos_internal.h" +#include "libos_table.h" +#include "libos_utils.h" +#include "linux_abi/fs.h" +#include "linux_abi/timerfd.h" +#include "linux_eventfd.h" +#include "pal.h" +#include "toml_utils.h" + +bool g_timerfd_allow_fork __attribute_migratable = false; + +/* atomic per-process number of currently existing timerfds, used in `libos_clone.c` */ +uint32_t g_timerfd_cnt = 0; + +int init_timerfd(void) { + int ret; + + assert(g_manifest_root); + ret = toml_bool_in(g_manifest_root, "sys.experimental__allow_timerfd_fork", + /*defaultval=*/false, &g_timerfd_allow_fork); + if (ret < 0) { + log_error("Cannot parse 'sys.experimental__allow_timerfd_fork' (the value must be `true` " + "or `false`)"); + return -EINVAL; + } + + return 0; +} + +static void timerfd_dummy_host_write(struct libos_handle* hdl, uint64_t host_val) { + uint64_t buf_dummy_host_val = host_val; + size_t dummy_host_val_count = sizeof(buf_dummy_host_val); + + int ret = PalStreamWrite(hdl->pal_handle, /*offset=*/0, &dummy_host_val_count, + 
&buf_dummy_host_val); + if (ret < 0 || dummy_host_val_count != sizeof(buf_dummy_host_val)) { + /* must not happen in benign case, consider it an attack and panic */ + BUG(); + } +} + +static int create_timerfd_pal_handle(PAL_HANDLE* out_pal_handle) { + int ret; + + PAL_HANDLE hdl = NULL; + + ret = PalStreamOpen(URI_PREFIX_EVENTFD, PAL_ACCESS_RDWR, /*share_flags=*/0, + PAL_CREATE_IGNORED, /*options=*/0, &hdl); + if (ret < 0) { + log_error("timerfd: dummy host eventfd creation failure"); + return pal_to_unix_errno(ret); + } + + /* see `fs/timerfd/fs.c` for the handle-close counterpart */ + (void)__atomic_add_fetch(&g_timerfd_cnt, 1, __ATOMIC_ACQ_REL); + + *out_pal_handle = hdl; + return 0; +} + +long libos_syscall_timerfd_create(int clockid, int flags) { + int ret; + + if ((flags & ~TFD_CREATE_FLAGS) || + (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME && + clockid != CLOCK_REALTIME_ALARM && clockid != CLOCK_BOOTTIME && + clockid != CLOCK_BOOTTIME_ALARM)) + return -EINVAL; + + if (clockid != CLOCK_REALTIME) { + if (FIRST_TIME()) { + log_warning("Unsupported clockid; replaced by the system-wide real-time clock."); + } + } + + struct libos_handle* hdl = get_new_handle(); + if (!hdl) + return -ENOMEM; + + hdl->type = TYPE_TIMERFD; + hdl->fs = &timerfd_builtin_fs; + hdl->flags = O_RDONLY | (flags & TFD_NONBLOCK ? O_NONBLOCK : 0); + hdl->acc_mode = MAY_READ; + + ret = create_timerfd_pal_handle(&hdl->pal_handle); + if (ret < 0) + goto out; + + ret = set_new_fd_handle(hdl, flags & TFD_CLOEXEC ? 
FD_CLOEXEC : 0, NULL); +out: + put_handle(hdl); + return ret; +} + +static void timerfd_update(struct libos_handle* hdl) { + spinlock_lock(&hdl->info.timerfd.expiration_lock); + + if (hdl->info.timerfd.num_expirations < UINT64_MAX) { + hdl->info.timerfd.num_expirations++; + hdl->info.timerfd.dummy_host_val++; + + /* perform a write (not supposed to block) to send an event to reading/polling threads */ + timerfd_dummy_host_write(hdl, /*host_val=*/1); + } + + spinlock_unlock(&hdl->info.timerfd.expiration_lock); + + maybe_epoll_et_trigger(hdl, /*ret=*/0, /*in=*/false, /*unused was_partial=*/false); +} + +static void callback_itimer(IDTYPE caller, void* arg) { + // XXX: Can we simplify this code or streamline with the other callback? + __UNUSED(caller); + + struct libos_handle* hdl = (struct libos_handle*)arg; + + spinlock_lock(&hdl->info.timerfd.timer_lock); + hdl->info.timerfd.timeout += hdl->info.timerfd.reset; + uint64_t next_reset = hdl->info.timerfd.reset; + spinlock_unlock(&hdl->info.timerfd.timer_lock); + + if (next_reset) + install_async_event(hdl->pal_handle, next_reset, /*absolute_time=*/false, &callback_itimer, + (void*)hdl); + + timerfd_update(hdl); +} + +long libos_syscall_timerfd_settime(int fd, int flags, const struct __kernel_itimerspec* value, + struct __kernel_itimerspec* ovalue) { + int ret; + + struct libos_handle* hdl = get_fd_handle(fd, /*fd_flags=*/NULL, /*map=*/NULL); + if (!hdl) { + return -EBADF; + } + + if (!is_user_memory_readable(value, sizeof(*value))) { + ret = -EFAULT; + goto out; + } + if (ovalue && !is_user_memory_writable(ovalue, sizeof(*ovalue))) { + ret = -EFAULT; + goto out; + } + + if (flags & ~TFD_SETTIME_FLAGS) { + ret = -EINVAL; + goto out; + } + + /* NOTE: cancelable timer (for the case where reads on timerfd would return `ECANCELED` when the + * real-time clock undergoes a discontinuous change) is currently unsupported; needs to be + * specified along with `TFD_TIMER_ABSTIME`. 
*/ + if (flags & TFD_TIMER_CANCEL_ON_SET) { + ret = -ENOSYS; + goto out; + } + + uint64_t setup_time = 0; + ret = PalSystemTimeQuery(&setup_time); + if (ret < 0) { + ret = pal_to_unix_errno(ret); + goto out; + } + + uint64_t next_value = timespec_to_us(&value->it_value); + uint64_t next_reset = timespec_to_us(&value->it_interval); + + spinlock_lock(&hdl->info.timerfd.timer_lock); + + uint64_t current_timeout = hdl->info.timerfd.timeout > setup_time + ? hdl->info.timerfd.timeout - setup_time + : 0; + uint64_t current_reset = hdl->info.timerfd.reset; + + bool absolute_time = flags & TFD_TIMER_ABSTIME; + if (absolute_time) { + hdl->info.timerfd.timeout = next_value; + } else { + hdl->info.timerfd.timeout = setup_time + next_value; + } + hdl->info.timerfd.reset = next_reset; + + spinlock_unlock(&hdl->info.timerfd.timer_lock); + + if (next_value) { + int64_t install_ret = install_async_event(hdl->pal_handle, next_value, absolute_time, + &callback_itimer, (void*)hdl); + if (install_ret < 0) { + ret = install_ret; + goto out; + } + } + + if (ovalue) { + ovalue->it_interval.tv_sec = current_reset / TIME_US_IN_S; + ovalue->it_interval.tv_nsec = (current_reset % TIME_US_IN_S) * TIME_NS_IN_US; + ovalue->it_value.tv_sec = current_timeout / TIME_US_IN_S; + ovalue->it_value.tv_nsec = (current_timeout % TIME_US_IN_S) * TIME_NS_IN_US; + } + + ret = 0; +out: + put_handle(hdl); + return ret; +} + +long libos_syscall_timerfd_gettime(int fd, struct __kernel_itimerspec* value) { + int ret; + + struct libos_handle* hdl = get_fd_handle(fd, /*fd_flags=*/NULL, /*map=*/NULL); + if (!hdl) { + return -EBADF; + } + + if (!is_user_memory_writable(value, sizeof(*value))) { + ret = -EFAULT; + goto out; + } + + uint64_t setup_time = 0; + ret = PalSystemTimeQuery(&setup_time); + if (ret < 0) { + ret = pal_to_unix_errno(ret); + goto out; + } + + spinlock_lock(&hdl->info.timerfd.timer_lock); + uint64_t current_timeout = hdl->info.timerfd.timeout > setup_time + ? 
hdl->info.timerfd.timeout - setup_time + : 0; + uint64_t current_reset = hdl->info.timerfd.reset; + spinlock_unlock(&hdl->info.timerfd.timer_lock); + + value->it_interval.tv_sec = current_reset / TIME_US_IN_S; + value->it_interval.tv_nsec = (current_reset % TIME_US_IN_S) * TIME_NS_IN_US; + value->it_value.tv_sec = current_timeout / TIME_US_IN_S; + value->it_value.tv_nsec = (current_timeout % TIME_US_IN_S) * TIME_NS_IN_US; + + ret = 0; +out: + put_handle(hdl); + return ret; +} diff --git a/libos/test/ltp/ltp.cfg b/libos/test/ltp/ltp.cfg index e40899a1b9..7c2c98c62d 100644 --- a/libos/test/ltp/ltp.cfg +++ b/libos/test/ltp/ltp.cfg @@ -2439,8 +2439,8 @@ skip = yes [timer_settime*] skip = yes -# no timerfd -[timerfd*] +# clocks other than `CLOCK_REALTIME` are not supported +[timerfd04] skip = yes [times03] diff --git a/libos/test/ltp/ltp_bug_1075.cfg b/libos/test/ltp/ltp_bug_1075.cfg index d01a18ee51..eb04e42d1c 100644 --- a/libos/test/ltp/ltp_bug_1075.cfg +++ b/libos/test/ltp/ltp_bug_1075.cfg @@ -919,6 +919,9 @@ skip = yes [tgkill03] skip = yes +[timerfd01] +skip = yes + [umask01] skip = yes diff --git a/libos/test/regression/meson.build b/libos/test/regression/meson.build index cbd15b4443..8b8d000862 100644 --- a/libos/test/regression/meson.build +++ b/libos/test/regression/meson.build @@ -144,6 +144,8 @@ tests = { 'tcp_einprogress': {}, 'tcp_ipv6_v6only': {}, 'tcp_msg_peek': {}, + 'timerfd': {}, + 'timerfd_fork': {}, 'udp': {}, 'uid_gid': {}, 'unix': {}, diff --git a/libos/test/regression/test_libos.py b/libos/test/regression/test_libos.py index dbfdff2a82..30fc9fb0be 100644 --- a/libos/test/regression/test_libos.py +++ b/libos/test/regression/test_libos.py @@ -992,6 +992,26 @@ def test_140_flock_lock(self): os.remove('tmp/flock_file2') self.assertIn('TEST OK', stdout) + def test_150_timerfd(self): + stdout, _ = self.run_binary(['timerfd'], timeout=120) + self.assertIn("TEST OK", stdout) + + def test_151_timerfd_fork_disallowed(self): + try: + 
self.run_binary(['timerfd_fork_disallowed']) + self.fail('timerfd_fork_disallowed unexpectedly succeeded') + except subprocess.CalledProcessError as e: + stderr = e.stderr.decode() + self.assertIn('The app tried to create a subprocess, but this is disallowed because ' + 'timerfd is emulated in a secure single-process mode', stderr) + + def test_152_timerfd_fork_allowed_failing(self): + try: + self.run_binary(['timerfd_fork_allowed_failing']) + self.fail('timerfd_fork_allowed_failing unexpectedly succeeded') + except AssertionError as e: + self.assertIn('timed out', e.args[0]) + class TC_31_Syscall(RegressionTestCase): def test_000_syscall_redirect(self): stdout, _ = self.run_binary(['syscall']) diff --git a/libos/test/regression/tests.toml b/libos/test/regression/tests.toml index 1c8f858ba8..b1d6c7f008 100644 --- a/libos/test/regression/tests.toml +++ b/libos/test/regression/tests.toml @@ -121,6 +121,9 @@ manifests = [ "tcp_einprogress", "tcp_ipv6_v6only", "tcp_msg_peek", + "timerfd", + "timerfd_fork_disallowed", + "timerfd_fork_allowed_failing", "toml_parsing", "udp", "uid_gid", diff --git a/libos/test/regression/tests_musl.toml b/libos/test/regression/tests_musl.toml index f0803fb384..b0db07a302 100644 --- a/libos/test/regression/tests_musl.toml +++ b/libos/test/regression/tests_musl.toml @@ -122,6 +122,10 @@ manifests = [ "tcp_einprogress", "tcp_ipv6_v6only", "tcp_msg_peek", + "timerfd", + "timerfd_fork", + "timerfd_fork_disallowed", + "timerfd_fork_allowed_failing", "toml_parsing", "udp", "uid_gid", diff --git a/libos/test/regression/timerfd.c b/libos/test/regression/timerfd.c new file mode 100644 index 0000000000..17704ac1c7 --- /dev/null +++ b/libos/test/regression/timerfd.c @@ -0,0 +1,302 @@ +/* SPDX-License-Identifier: LGPL-3.0-or-later */ +/* Copyright (C) 2024 Intel Corporation + * Kailun Qin + */ + +/* + * Single-process test for `timerfd` syscalls (`timerfd_create()`, `timerfd_settime()` and + * `timerfd_gettime()`). 
+ * + * The tests involve cases including reading a blocking/non-blocking timerfd, poll/epoll/selecting + * on timerfds, setting up a relative/absolute/periodic timerfd and reading a timerfd from multiple + * threads. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +#define EXPECTED_EXPIRATIONS 1 +#define EXPECTED_PERIODIC_TIMER_EXPIRATION_COUNT 5 +#define NUM_FDS 2 +#define NUM_THREADS 5 +#define PERIODIC_INTERVAL 1 +#define TIMEOUT_VALUE 2 + +static void set_timerfd_relative(int fd, bool periodic) { + struct itimerspec new_value; + + new_value.it_value.tv_sec = TIMEOUT_VALUE; + new_value.it_value.tv_nsec = 0; + new_value.it_interval.tv_sec = periodic ? PERIODIC_INTERVAL : 0; + new_value.it_interval.tv_nsec = 0; + + CHECK(timerfd_settime(fd, 0, &new_value, NULL)); +} + +static void set_timerfds_relative(int fds[NUM_FDS], bool periodic) { + for (int i = 0; i < NUM_FDS; i++) + set_timerfd_relative(fds[i], periodic); +} + +static void set_timerfd_absolute(int fd, struct timespec* abs_time) { + struct itimerspec new_value; + + /* Set the timer to expire at the absolute time specified */ + new_value.it_value.tv_sec = abs_time->tv_sec; + new_value.it_value.tv_nsec = abs_time->tv_nsec; + new_value.it_interval.tv_sec = 0; + new_value.it_interval.tv_nsec = 0; + + /* Set the timer to absolute time */ + CHECK(timerfd_settime(fd, TFD_TIMER_ABSTIME, &new_value, NULL)); +} + +static void create_timerfds(int fds[NUM_FDS]) { + for (int i = 0; i < NUM_FDS; i++) + fds[i] = CHECK(timerfd_create(CLOCK_REALTIME, 0)); +} + +static void close_timerfds(int fds[NUM_FDS]) { + for (int i = 0; i < NUM_FDS; i++) + CHECK(close(fds[i])); +} + +static void test_select(int fds[NUM_FDS]) { + fd_set rfds; + FD_ZERO(&rfds); + for (int i = 0; i < NUM_FDS; i++) { + FD_SET(fds[i], &rfds); + } + + int max_fd = MAX(fds[0], fds[1]) + 1;
+ CHECK(select(max_fd, &rfds, NULL, NULL, NULL)); + + for (int i = 0; i < NUM_FDS; i++) { + if (FD_ISSET(fds[i], &rfds)) { + uint64_t expirations; + CHECK(read(fds[i], &expirations, sizeof(expirations))); + if (expirations != 1) + errx(1, "select: unexpected number of expirations (expected 1, got %lu)", + expirations); + } + } +} + +/* polls all timerfds for POLLIN without a timeout and checks that every fd reported readable + * yields exactly one expiration */ +static void test_poll(int fds[NUM_FDS]) { + struct pollfd pfds[NUM_FDS]; + for (int i = 0; i < NUM_FDS; i++) { + pfds[i].fd = fds[i]; + pfds[i].events = POLLIN; + } + + CHECK(poll(pfds, NUM_FDS, -1)); + + for (int i = 0; i < NUM_FDS; i++) { + if (pfds[i].revents & POLLIN) { + uint64_t expirations; + CHECK(read(fds[i], &expirations, sizeof(expirations))); + if (expirations != 1) + errx(1, "poll: unexpected number of expirations (expected 1, got %lu)", + expirations); + } + } +} + +/* registers all timerfds for EPOLLIN in a new epoll instance, waits without a timeout and checks + * that every fd reported ready yields exactly one expiration */ +static void test_epoll(int fds[NUM_FDS]) { + int epfd = CHECK(epoll_create1(0)); + + struct epoll_event ev; + ev.events = EPOLLIN; + for (int i = 0; i < NUM_FDS; i++) { + ev.data.fd = fds[i]; + CHECK(epoll_ctl(epfd, EPOLL_CTL_ADD, fds[i], &ev)); + } + + struct epoll_event events[NUM_FDS]; + int nfds = CHECK(epoll_wait(epfd, events, NUM_FDS, -1)); + + for (int n = 0; n < nfds; ++n) { + uint64_t expirations; + CHECK(read(events[n].data.fd, &expirations, sizeof(expirations))); + if (expirations != 1) + errx(1, "epoll: unexpected number of expirations (expected 1, got %lu)", expirations); + } + + /* checked like the other calls in this function, so a failing close() is not ignored */ + CHECK(close(epfd)); +} + +static pthread_cond_t cond = PTHREAD_COND_INITIALIZER; +static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; +static size_t expiration_count = 0; + +/* reader thread: loops forever, adding the expiration count of each read to `expiration_count` + * under `mutex` and signalling `cond` */ +static void* timerfd_read_thread_periodic_timer(void* arg) { + int fd = *(int*)arg; + uint64_t expirations; + + for (;;) { + CHECK(read(fd, &expirations, sizeof(expirations))); + pthread_mutex_lock(&mutex); + expiration_count += expirations; + pthread_cond_signal(&cond); + pthread_mutex_unlock(&mutex); + } + + return NULL; +} + +static void test_periodic_timer(int fd) { + pthread_t thread; +
CHECK(pthread_create(&thread, NULL, timerfd_read_thread_periodic_timer, &fd)); + + /* wait for at least 5 expirations */ + pthread_mutex_lock(&mutex); + while (expiration_count < EXPECTED_PERIODIC_TIMER_EXPIRATION_COUNT) { + pthread_cond_wait(&cond, &mutex); + } + + /* check the counter while still holding the mutex: the reader thread keeps updating it */ + if (expiration_count != EXPECTED_PERIODIC_TIMER_EXPIRATION_COUNT) + errx(1, "periodic_timer: unexpected number of expirations (expected 5, got %lu)", + expiration_count); + pthread_mutex_unlock(&mutex); + + /* cleanup: cancel the read thread and wait for it to exit */ + CHECK(pthread_cancel(thread)); + CHECK(pthread_join(thread, NULL)); +} + +/* thread body: reads the timerfd (passed by pointer in `arg`) once and fails the test if the read + * reports zero expirations */ +static void* timerfd_read_thread(void* arg) { + int fd = *(int*)arg; + uint64_t expirations; + CHECK(read(fd, &expirations, sizeof(expirations))); + if (expirations == 0) + /* read() succeeded, so there is no errno to report: errx() rather than err() */ + errx(1, "threaded read: unexpected number of expirations"); + pthread_exit(NULL); +} + +/* spawns NUM_THREADS reader threads one after another, joining each before creating the next */ +static void test_threaded_read(int fd) { + pthread_t threads[NUM_THREADS]; + for (int i = 0; i < NUM_THREADS; i++) { + CHECK(pthread_create(&threads[i], NULL, timerfd_read_thread, &fd)); + /* wait for the thread to finish */ + CHECK(pthread_join(threads[i], NULL)); + } +} + +static void test_timerfd_gettime(int fd) { + struct itimerspec curr_value; + CHECK(timerfd_gettime(fd, &curr_value)); + + /* the timer should be set to expire close to 2 seconds */ + if (curr_value.it_value.tv_sec > 2 || curr_value.it_value.tv_sec < 1 || + curr_value.it_value.tv_nsec < 0 || curr_value.it_value.tv_nsec >= 1000000000) { + errx(1, "timerfd_gettime: unexpected timer value (expected close to 2.0, got %ld.%09ld)", + curr_value.it_value.tv_sec, curr_value.it_value.tv_nsec); + } +} + +static void test_absolute_time(int fd) { + struct timespec now; + struct timespec abs_time; + uint64_t expirations; + + /* test timerfd with absolute time set in the future */ + CHECK(clock_gettime(CLOCK_REALTIME, &now)); + abs_time.tv_sec = now.tv_sec + TIMEOUT_VALUE; + abs_time.tv_nsec = now.tv_nsec; + + set_timerfd_absolute(fd, &abs_time); + +
CHECK(read(fd, &expirations, sizeof(expirations))); + if (expirations != EXPECTED_EXPIRATIONS) { + errx(1, "absolute_time future: unexpected number of expirations (expected %d, got %lu)", + EXPECTED_EXPIRATIONS, expirations); + } + + expirations = 0; + memset(&now, 0, sizeof(struct timespec)); + memset(&abs_time, 0, sizeof(struct timespec)); + + /* test timerfd with absolute time set in the past */ + CHECK(clock_gettime(CLOCK_REALTIME, &now)); + abs_time.tv_sec = now.tv_sec - TIMEOUT_VALUE; + abs_time.tv_nsec = now.tv_nsec; + + set_timerfd_absolute(fd, &abs_time); + + CHECK(read(fd, &expirations, sizeof(expirations))); + if (expirations != EXPECTED_EXPIRATIONS) { + errx(1, "absolute_time past: unexpected number of expirations (expected %d, got %lu)", + EXPECTED_EXPIRATIONS, expirations); + } +} + +static void test_read(int fd, bool non_blocking) { + if (non_blocking) { + CHECK(fcntl(fd, F_SETFL, O_NONBLOCK)); + } + + uint64_t expirations; + int retval = read(fd, &expirations, sizeof(expirations)); + + if (non_blocking) { + if (retval != -1 || errno != EAGAIN) { + errx(1, "non-blocking read: read returned %d, errno %d, expected -1 and EAGAIN", + retval, errno); + } + } else { + CHECK(retval); + if (expirations != EXPECTED_EXPIRATIONS) { + errx(1, "read: unexpected number of expirations (expected %d, got %lu)", + EXPECTED_EXPIRATIONS, expirations); + } + } +} + +int main(void) { + int fds[NUM_FDS]; + create_timerfds(fds); + + set_timerfds_relative(fds, /*periodic*/false); + test_select(fds); + + set_timerfds_relative(fds, /*periodic*/false); + test_poll(fds); + + set_timerfds_relative(fds, /*periodic*/false); + test_epoll(fds); + + set_timerfd_relative(fds[0], /*periodic*/true); + test_periodic_timer(fds[0]); + + set_timerfd_relative(fds[0], /*periodic*/true); + test_threaded_read(fds[0]); + + set_timerfd_relative(fds[0], /*periodic*/false); + test_timerfd_gettime(fds[0]); + + set_timerfd_relative(fds[0], /*periodic*/false); + test_read(fds[0], 
/*non_blocking=*/false); + test_read(fds[0], /*non_blocking=*/true); + + test_absolute_time(fds[1]); + + close_timerfds(fds); + + puts("TEST OK"); + return 0; +} diff --git a/libos/test/regression/timerfd_fork.c b/libos/test/regression/timerfd_fork.c new file mode 100644 index 0000000000..dd7f58b89b --- /dev/null +++ b/libos/test/regression/timerfd_fork.c @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: LGPL-3.0-or-later */ +/* Copyright (C) 2024 Intel Corporation + * Kailun Qin + */ + +/* Multi-process test for `timerfd` syscalls (`timerfd_create()`, `timerfd_settime()` and + * `timerfd_gettime()`). + * + * Note that timerfd is currently only emulated in a secure single-process mode, so this test does + * not work. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +#define EXPECTED_EXPIRATIONS 1 +#define TIMEOUT_VALUE 2 + +static void set_timerfd(int fd) { + struct itimerspec new_value; + + new_value.it_value.tv_sec = TIMEOUT_VALUE; + new_value.it_value.tv_nsec = 0; + new_value.it_interval.tv_sec = 0; + new_value.it_interval.tv_nsec = 0; + + CHECK(timerfd_settime(fd, 0, &new_value, NULL)); +} + +/* forks: the child reads the timerfd and checks the expiration count, the parent waits for the + * child and fails the test if the child did not exit with status 0 */ +static void test_multi_process(int fd) { + pid_t pid = CHECK(fork()); + if (pid == 0) { + uint64_t expirations; + /* child: wait for the timer to expire and then read the timerfd */ + CHECK(read(fd, &expirations, sizeof(expirations))); + if (expirations != EXPECTED_EXPIRATIONS) { + errx(1, "child process: unexpected number of expirations (expected %d, got %lu)", + EXPECTED_EXPIRATIONS, expirations); + } + exit(0); + } else { + /* parent: do nothing and let the child process read the timerfd */ + /* wait for the child process to exit and propagate its failure, if any */ + int status; + CHECK(waitpid(pid, &status, 0)); + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) + errx(1, "child process did not exit cleanly"); + } +} + +int main(void) { + int fd = CHECK(timerfd_create(CLOCK_REALTIME, 0)); + + set_timerfd(fd); + test_multi_process(fd); + + CHECK(close(fd)); + + puts("TEST OK"); + return 0; +} diff --git
a/libos/test/regression/timerfd_fork_allowed_failing.manifest.template b/libos/test/regression/timerfd_fork_allowed_failing.manifest.template new file mode 100644 index 0000000000..b72c2f5e89 --- /dev/null +++ b/libos/test/regression/timerfd_fork_allowed_failing.manifest.template @@ -0,0 +1,27 @@ +{% set entrypoint = "timerfd_fork" -%} + +loader.entrypoint = "file:{{ gramine.libos }}" +libos.entrypoint = "{{ entrypoint }}" + +loader.log_level = "warning" + +loader.env.LD_LIBRARY_PATH = "/lib" +loader.insecure__use_cmdline_argv = true + +fs.mounts = [ + { path = "/lib", uri = "file:{{ gramine.runtimedir(libc) }}" }, + { path = "/{{ entrypoint }}", uri = "file:{{ binary_dir }}/{{ entrypoint }}" }, +] + +# below line enables forking, but after the fork, the child reads its own version of timerfd object +# which will never expire (so the test fails) +sys.experimental__allow_timerfd_fork = true + +sgx.debug = true +sgx.edmm_enable = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }} + +sgx.trusted_files = [ + "file:{{ gramine.libos }}", + "file:{{ gramine.runtimedir(libc) }}/", + "file:{{ binary_dir }}/{{ entrypoint }}", +] diff --git a/libos/test/regression/timerfd_fork_disallowed.manifest.template b/libos/test/regression/timerfd_fork_disallowed.manifest.template new file mode 100644 index 0000000000..e4ba1c1ba7 --- /dev/null +++ b/libos/test/regression/timerfd_fork_disallowed.manifest.template @@ -0,0 +1,23 @@ +{% set entrypoint = "timerfd_fork" -%} + +loader.entrypoint = "file:{{ gramine.libos }}" +libos.entrypoint = "{{ entrypoint }}" + +loader.log_level = "warning" + +loader.env.LD_LIBRARY_PATH = "/lib" +loader.insecure__use_cmdline_argv = true + +fs.mounts = [ + { path = "/lib", uri = "file:{{ gramine.runtimedir(libc) }}" }, + { path = "/{{ entrypoint }}", uri = "file:{{ binary_dir }}/{{ entrypoint }}" }, +] + +sgx.debug = true +sgx.edmm_enable = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }} + +sgx.trusted_files = [ + "file:{{ gramine.libos }}", + 
"file:{{ gramine.runtimedir(libc) }}/", + "file:{{ binary_dir }}/{{ entrypoint }}", +]