diff --git a/Documentation/devel/features.md b/Documentation/devel/features.md index 0d1434f8ef..fe61ea8367 100644 --- a/Documentation/devel/features.md +++ b/Documentation/devel/features.md @@ -1036,7 +1036,7 @@ The below list is generated from the [syscall table of Linux - ☒ `signalfd()` [7](#signals-and-process-state-changes) -- ☒ `timerfd_create()` +- ▣ `timerfd_create()` [20](#sleeps-timers-and-alarms) - ▣ `eventfd()` @@ -1045,10 +1045,10 @@ The below list is generated from the [syscall table of Linux - ▣ `fallocate()` [9a](#file-system-operations) -- ☒ `timerfd_settime()` +- ▣ `timerfd_settime()` [20](#sleeps-timers-and-alarms) -- ☒ `timerfd_gettime()` +- ▣ `timerfd_gettime()` [20](#sleeps-timers-and-alarms) - ☑ `accept4()` @@ -2871,9 +2871,20 @@ Gramine implements getting and setting the interval timer: `getitimer()` and `se Gramine implements alarm clocks via `alarm()`. +Gramine implements timers that notify via file descriptors: `timerfd_create()`, `timerfd_settime()` +and `timerfd_gettime()`. The timerfd object is created inside Gramine, and all operations are +resolved entirely inside Gramine. Each timerfd object is associated with a dummy eventfd created on +the host. This is purely for triggering read notifications (e.g., in epoll); timerfd data is +verified inside Gramine and is never exposed to the host. Since the host is used purely for +notifications, a malicious host can only induce Denial of Service (DoS) attacks. + +The emulation is currently implemented at the level of a single process. The emulation *may* work for +multi-process applications, e.g., if the child process inherits the timerfd object but doesn't use +it. However, all timerfds created in the parent process are marked as invalid in child processes, +i.e., inter-process timing signals via timerfds are not allowed. + Gramine does *not* currently implement the POSIX per-process timer: `timer_create()`, etc. 
Gramine -also does not currently implement timers that notify via file descriptors. Gramine could implement -these timers in the future, if need arises. +could implement it in the future, if need arises.
Related system calls @@ -2889,9 +2900,9 @@ these timers in the future, if need arises. - ☒ `timer_getoverrun()`: may be implemented in the future - ☒ `timer_delete()`: may be implemented in the future -- ☒ `timerfd_create()`: may be implemented in the future -- ☒ `timerfd_settime()`: may be implemented in the future -- ☒ `timerfd_gettime()`: may be implemented in the future +- ▣ `timerfd_create()`: see notes above +- ▣ `timerfd_settime()`: see notes above +- ▣ `timerfd_gettime()`: see notes above

diff --git a/libos/include/libos_fs.h b/libos/include/libos_fs.h index 4018c7c948..875225c50a 100644 --- a/libos/include/libos_fs.h +++ b/libos/include/libos_fs.h @@ -183,7 +183,7 @@ struct libos_fs_ops { int (*poll)(struct libos_handle* hdl, int in_events, int* out_events); /* Verify a single handle after poll. Must update `pal_ret_events` in-place with only allowed - * ones. Used in e.g. secure eventfd FS to verify if the host is not lying to us. */ + * ones. Used in e.g. secure eventfd and timerfd FS to verify if the host is not lying to us. */ void (*post_poll)(struct libos_handle* hdl, pal_wait_flags_t* pal_ret_events); /* checkpoint/migrate the file system */ @@ -942,6 +942,7 @@ extern struct libos_fs eventfd_builtin_fs; extern struct libos_fs synthetic_builtin_fs; extern struct libos_fs path_builtin_fs; extern struct libos_fs shm_builtin_fs; +extern struct libos_fs timerfd_builtin_fs; struct libos_fs* find_fs(const char* name); diff --git a/libos/include/libos_handle.h b/libos/include/libos_handle.h index 4ce281f4e0..615950a486 100644 --- a/libos/include/libos_handle.h +++ b/libos/include/libos_handle.h @@ -46,6 +46,7 @@ enum libos_handle_type { /* Special handles: */ TYPE_EPOLL, /* epoll handles, see `libos_epoll.c` */ TYPE_EVENTFD, /* eventfd handles, used by `eventfd` filesystem */ + TYPE_TIMERFD, /* timerfd handles, used by `timerfd` filesystem */ }; struct libos_pipe_handle { @@ -142,6 +143,18 @@ struct libos_eventfd_handle { uint64_t dummy_host_val; }; +struct libos_timerfd_handle { + bool broken_in_child; + + spinlock_t expiration_lock; /* protecting below fields */ + uint64_t num_expirations; + uint64_t dummy_host_val; + + spinlock_t timer_lock; /* protecting below fields */ + uint64_t timeout; + uint64_t reset; +}; + struct libos_handle { enum libos_handle_type type; bool is_dir; @@ -217,6 +230,8 @@ struct libos_handle { struct libos_epoll_handle epoll; /* TYPE_EPOLL */ struct libos_eventfd_handle eventfd; /* TYPE_EVENTFD */ + + struct 
libos_timerfd_handle timerfd; /* TYPE_TIMERFD */ } info; struct libos_dir_handle dir_info; @@ -232,7 +247,7 @@ struct libos_handle { * `read`, `seek` but not `pread`). This lock should be taken *before* `libos_handle.lock` and * `libos_inode.lock`. Must be used *only* via maybe_lock_pos_handle() and * maybe_unlock_pos_handle(); these functions make sure that the lock is acquired only on those - * handle types that are seekable (e.g. not on eventfds or pipes). */ + * handle types that are seekable (e.g. not on eventfds, timerfds or pipes). */ struct libos_lock pos_lock; }; diff --git a/libos/include/libos_table.h b/libos/include/libos_table.h index 5e110aca58..4ee6df6fa4 100644 --- a/libos/include/libos_table.h +++ b/libos/include/libos_table.h @@ -207,3 +207,7 @@ long libos_syscall_getrandom(char* buf, size_t count, unsigned int flags); long libos_syscall_mlock2(unsigned long start, size_t len, int flags); long libos_syscall_sysinfo(struct sysinfo* info); long libos_syscall_close_range(unsigned int first, unsigned int last, unsigned int flags); +long libos_syscall_timerfd_create(int clockid, int flags); +long libos_syscall_timerfd_settime(int fd, int flags, const struct __kernel_itimerspec* value, + struct __kernel_itimerspec* ovalue); +long libos_syscall_timerfd_gettime(int fd, struct __kernel_itimerspec* value); diff --git a/libos/include/libos_utils.h b/libos/include/libos_utils.h index e3f4cb2e1f..f408d9bf35 100644 --- a/libos/include/libos_utils.h +++ b/libos/include/libos_utils.h @@ -52,8 +52,14 @@ void clean_link_map_list(void); int create_pipe(char* name, char* uri, size_t size, PAL_HANDLE* hdl, bool use_vmid_for_name); /* Asynchronous event support */ +enum async_event_type { + ASYNC_EVENT_TYPE_IO = 1, + ASYNC_EVENT_TYPE_ALARM_TIMER = 2, +}; + int init_async_worker(void); -int64_t install_async_event(PAL_HANDLE object, unsigned long time, +int64_t install_async_event(enum async_event_type type, PAL_HANDLE object, + unsigned long time_us, bool 
absolute_time, void (*callback)(IDTYPE caller, void* arg), void* arg); struct libos_thread* terminate_async_worker(void); diff --git a/libos/include/linux_abi/time.h b/libos/include/linux_abi/time.h index da848822de..f0e1f6d76d 100644 --- a/libos/include/linux_abi/time.h +++ b/libos/include/linux_abi/time.h @@ -37,3 +37,14 @@ struct __kernel_timezone { int tz_minuteswest; /* minutes west of Greenwich */ int tz_dsttime; /* type of dst correction */ }; + +#define TFD_TIMER_ABSTIME (1 << 0) +#define TFD_TIMER_CANCEL_ON_SET (1 << 1) +#define TFD_CLOEXEC O_CLOEXEC +#define TFD_NONBLOCK O_NONBLOCK + +#define TFD_SHARED_FCNTL_FLAGS (TFD_CLOEXEC | TFD_NONBLOCK) +/* Flags for timerfd_create. */ +#define TFD_CREATE_FLAGS TFD_SHARED_FCNTL_FLAGS +/* Flags for timerfd_settime. */ +#define TFD_SETTIME_FLAGS (TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET) diff --git a/libos/src/arch/x86_64/libos_table.c b/libos/src/arch/x86_64/libos_table.c index 9629423899..0e28c50b5c 100644 --- a/libos/src/arch/x86_64/libos_table.c +++ b/libos/src/arch/x86_64/libos_table.c @@ -297,11 +297,11 @@ libos_syscall_t libos_syscall_table[LIBOS_SYSCALL_BOUND] = { [__NR_utimensat] = (libos_syscall_t)0, // libos_syscall_utimensat [__NR_epoll_pwait] = (libos_syscall_t)libos_syscall_epoll_pwait, [__NR_signalfd] = (libos_syscall_t)0, // libos_syscall_signalfd - [__NR_timerfd_create] = (libos_syscall_t)0, // libos_syscall_timerfd_create + [__NR_timerfd_create] = (libos_syscall_t)libos_syscall_timerfd_create, [__NR_eventfd] = (libos_syscall_t)libos_syscall_eventfd, [__NR_fallocate] = (libos_syscall_t)libos_syscall_fallocate, - [__NR_timerfd_settime] = (libos_syscall_t)0, // libos_syscall_timerfd_settime - [__NR_timerfd_gettime] = (libos_syscall_t)0, // libos_syscall_timerfd_gettime + [__NR_timerfd_settime] = (libos_syscall_t)libos_syscall_timerfd_settime, + [__NR_timerfd_gettime] = (libos_syscall_t)libos_syscall_timerfd_gettime, [__NR_accept4] = (libos_syscall_t)libos_syscall_accept4, [__NR_signalfd4] = 
(libos_syscall_t)0, // libos_syscall_signalfd4 [__NR_eventfd2] = (libos_syscall_t)libos_syscall_eventfd2, diff --git a/libos/src/fs/libos_fs.c b/libos/src/fs/libos_fs.c index 5a29a36d6d..f10aefd74b 100644 --- a/libos/src/fs/libos_fs.c +++ b/libos/src/fs/libos_fs.c @@ -33,6 +33,7 @@ static struct libos_fs* g_builtin_fs[] = { &synthetic_builtin_fs, &path_builtin_fs, &shm_builtin_fs, + &timerfd_builtin_fs, }; static struct libos_lock g_mount_mgr_lock; diff --git a/libos/src/fs/proc/thread.c b/libos/src/fs/proc/thread.c index c3da147c48..ed1fa1a95a 100644 --- a/libos/src/fs/proc/thread.c +++ b/libos/src/fs/proc/thread.c @@ -287,6 +287,7 @@ static char* describe_handle(struct libos_handle* hdl) { case TYPE_EPOLL: str = "epoll:[?]"; break; case TYPE_EVENTFD: str = "eventfd:[?]"; break; case TYPE_SHM: str = "shm:[?]"; break; + case TYPE_TIMERFD: str = "timerfd:[?]"; break; default: str = "unknown:[?]"; break; } return strdup(str); diff --git a/libos/src/fs/timerfd/fs.c b/libos/src/fs/timerfd/fs.c new file mode 100644 index 0000000000..dfa74b9e58 --- /dev/null +++ b/libos/src/fs/timerfd/fs.c @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: LGPL-3.0-or-later */ +/* Copyright (C) 2024 Intel Corporation + * Kailun Qin + */ + +/* + * This file contains code for implementation of "timerfd" filesystem. For more information, see + * `libos/src/sys/libos_timerfd.c`. + */ + +#include "libos_fs.h" +#include "libos_handle.h" +#include "libos_internal.h" +#include "libos_lock.h" +#include "linux_abi/errors.h" +#include "pal.h" + +/* Enforce a restriction that all timerfds created in the parent process are marked as invalid in + * child processes, i.e. inter-process timing signals via timerfds is not allowed. This restriction + * is because LibOS doesn't yet implement sync between timerfd objects. 
*/ +static int timerfd_checkin(struct libos_handle* hdl) { + assert(hdl->type == TYPE_TIMERFD); + hdl->info.timerfd.broken_in_child = true; + return 0; +} + +static void timerfd_dummy_host_read(struct libos_handle* hdl) { + int ret; + uint64_t buf_dummy_host_val = 0; + size_t dummy_host_val_count = sizeof(buf_dummy_host_val); + do { + ret = PalStreamRead(hdl->pal_handle, /*offset=*/0, &dummy_host_val_count, + &buf_dummy_host_val); + } while (ret == -PAL_ERROR_INTERRUPTED); + if (ret < 0 || dummy_host_val_count != sizeof(buf_dummy_host_val)) { + /* must not happen in benign case, consider it an attack and panic */ + BUG(); + } +} + +static void timerfd_dummy_host_wait(struct libos_handle* hdl) { + pal_wait_flags_t wait_for_events = PAL_WAIT_READ; + pal_wait_flags_t ret_events = 0; + int ret = PalStreamsWaitEvents(1, &hdl->pal_handle, &wait_for_events, &ret_events, NULL); + if (ret < 0 && ret != -PAL_ERROR_INTERRUPTED) { + BUG(); + } + (void)ret_events; /* we don't care what events the host returned, we can't trust them anyway */ +} + +static ssize_t timerfd_read(struct libos_handle* hdl, void* buf, size_t count, file_off_t* pos) { + __UNUSED(pos); + assert(hdl->type == TYPE_TIMERFD); + + if (count < sizeof(uint64_t)) + return -EINVAL; + + if (hdl->info.timerfd.broken_in_child) { + log_warning("Child process tried to access timerfd created by parent process. 
This is " + "disallowed in Gramine."); + return -EIO; + } + + int ret; + spinlock_lock(&hdl->info.timerfd.expiration_lock); + + while (!hdl->info.timerfd.num_expirations) { + if (hdl->flags & O_NONBLOCK) { + ret = -EAGAIN; + goto out; + } + spinlock_unlock(&hdl->info.timerfd.expiration_lock); + timerfd_dummy_host_wait(hdl); + spinlock_lock(&hdl->info.timerfd.expiration_lock); + } + + memcpy(buf, &hdl->info.timerfd.num_expirations, sizeof(uint64_t)); + hdl->info.timerfd.num_expirations = 0; + + /* drain the dummy host eventfd (not supposed to block) so that polling threads stop seeing the + * already-consumed expirations as readable */ + if (hdl->info.timerfd.dummy_host_val) { + timerfd_dummy_host_read(hdl); + hdl->info.timerfd.dummy_host_val = 0; + } + + ret = sizeof(uint64_t); /* exactly one 8-byte counter is copied out, regardless of `count` */ +out: + spinlock_unlock(&hdl->info.timerfd.expiration_lock); + maybe_epoll_et_trigger(hdl, ret, /*in=*/true, /*unused was_partial=*/false); + return ret; +} + +static void timerfd_post_poll(struct libos_handle* hdl, pal_wait_flags_t* pal_ret_events) { + assert(hdl->type == TYPE_TIMERFD); + + if (hdl->info.timerfd.broken_in_child) { + log_warning("Child process tried to access timerfd created by parent process. 
This is " + "disallowed in Gramine."); + *pal_ret_events = PAL_WAIT_ERROR; + return; + } + + if (*pal_ret_events & (PAL_WAIT_ERROR | PAL_WAIT_HANG_UP | PAL_WAIT_WRITE)) { + /* impossible: we control timerfd inside the LibOS, and we never raise such conditions */ + BUG(); + } + + spinlock_lock(&hdl->info.timerfd.expiration_lock); + if (*pal_ret_events & PAL_WAIT_READ) { + /* there is data to read: verify if timerfd has number of expirations greater than zero */ + if (!hdl->info.timerfd.num_expirations) { + /* spurious or malicious notification, can legitimately happen if another thread + * consumed this event between this thread's poll wakeup and the post_poll callback; + * we currently choose to return a spurious notification to the user */ + *pal_ret_events &= ~PAL_WAIT_READ; + } + } + spinlock_unlock(&hdl->info.timerfd.expiration_lock); +} + +struct libos_fs_ops timerfd_fs_ops = { + .checkin = &timerfd_checkin, + .read = &timerfd_read, + .post_poll = &timerfd_post_poll, +}; + +struct libos_fs timerfd_builtin_fs = { + .name = "timerfd", + .fs_ops = &timerfd_fs_ops, +}; diff --git a/libos/src/libos_async.c b/libos/src/libos_async.c index b1c0b66db2..988f45364a 100644 --- a/libos/src/libos_async.c +++ b/libos/src/libos_async.c @@ -18,12 +18,13 @@ DEFINE_LIST(async_event); struct async_event { + enum async_event_type type; IDTYPE caller; /* thread installing this event */ LIST_TYPE(async_event) list; LIST_TYPE(async_event) triggered_list; void (*callback)(IDTYPE caller, void* arg); void* arg; - PAL_HANDLE object; /* handle (async IO) to wait on */ + PAL_HANDLE object; /* handle (async IO or timerfd) to wait on */ uint64_t expire_time_us; /* alarm/timer to wait on */ }; DEFINE_LISTP(async_event); @@ -40,25 +41,29 @@ static struct libos_pollable_event install_new_event; static int create_async_worker(void); -/* Threads register async events like alarm(), setitimer(), ioctl(FIOASYNC) - * using this function. 
These events are enqueued in async_list and delivered - * to async worker thread by triggering install_new_event. When event is - * triggered in async worker thread, the corresponding event's callback with - * arguments `arg` is called. This callback typically sends a signal to the +/* Threads register async events like alarm(), setitimer(), timerfd_settime(), ioctl(FIOASYNC) using + * this function. These events are enqueued in async_list and delivered to async worker thread by + * triggering install_new_event. When event is triggered in async worker thread, the corresponding + * event's callback with arguments `arg` is called. This callback typically sends a signal to the * thread which registered the event (saved in `event->caller`). * * We distinguish between alarm/timer events and async IO events: + * - alarm/timer events set time_us = microseconds (time_us = 0 cancels all pending + * alarms/timers). + * Specifically, when object != NULL and time_us != 0, this indicates a timerfd event. * - alarm/timer events set object = NULL and time_us = microseconds * (time_us = 0 cancels all pending alarms/timers). * - async IO events set object = handle and time_us = 0. * - * Function returns remaining usecs for alarm/timer events (same as alarm()) - * or 0 for async IO events. On error, it returns a negated error code. + * Function returns remaining usecs for alarm/timer events (same as alarm()) or 0 for async IO + * events. On error, it returns a negated error code. 
*/ -int64_t install_async_event(PAL_HANDLE object, uint64_t time_us, +int64_t install_async_event(enum async_event_type type, PAL_HANDLE object, + uint64_t time_us, bool absolute_time, void (*callback)(IDTYPE caller, void* arg), void* arg) { - /* if event happens on object, time_us must be zero */ - assert(!object || (object && !time_us)); + /* if event happens on async IO object, time_us must be zero */ + assert(type == ASYNC_EVENT_TYPE_ALARM_TIMER || + (type == ASYNC_EVENT_TYPE_IO && object && !time_us)); uint64_t now_us = 0; int ret = PalSystemTimeQuery(&now_us); @@ -73,21 +78,22 @@ int64_t install_async_event(PAL_HANDLE object, uint64_t time_us, return -ENOMEM; } + event->type = type; event->callback = callback; event->arg = arg; event->caller = get_cur_tid(); event->object = object; - event->expire_time_us = time_us ? now_us + time_us : 0; + event->expire_time_us = time_us ? (absolute_time ? time_us : now_us + time_us) : 0; lock(&async_worker_lock); - if (callback != &cleanup_thread && !object) { - /* This is alarm() or setitimer() emulation, treat both according to - * alarm() syscall semantics: cancel any pending alarm/timer. */ + if (callback != &cleanup_thread && type == ASYNC_EVENT_TYPE_ALARM_TIMER) { + /* This is alarm(), setitimer(), timerfd_settime() emulation, treat all according to alarm() + * syscall semantics: cancel any pending alarm/timer. 
*/ struct async_event* tmp; struct async_event* n; LISTP_FOR_EACH_ENTRY_SAFE(tmp, n, &async_list, list) { - if (tmp->expire_time_us) { + if (tmp->object == object && tmp->expire_time_us) { /* this is a pending alarm/timer, cancel it and save its expiration time */ if (max_prev_expire_time_us < tmp->expire_time_us) max_prev_expire_time_us = tmp->expire_time_us; @@ -208,7 +214,7 @@ static int libos_async_worker(void* arg) { bool other_event = false; LISTP_FOR_EACH_ENTRY_SAFE(tmp, n, &async_list, list) { /* repopulate `pals` with IO events and find the next expiring alarm/timer */ - if (tmp->object) { + if (tmp->type == ASYNC_EVENT_TYPE_IO) { if (pals_cnt == pals_max_cnt) { /* grow `pals` to accommodate more objects */ PAL_HANDLE* tmp_pals = malloc(sizeof(*tmp_pals) * (1 + pals_max_cnt * 2)); @@ -244,7 +250,8 @@ static int libos_async_worker(void* arg) { pal_events[pals_cnt + 1] = PAL_WAIT_READ; ret_events[pals_cnt + 1] = 0; pals_cnt++; - } else if (tmp->expire_time_us && tmp->expire_time_us > now_us) { + } else if (tmp->type == ASYNC_EVENT_TYPE_ALARM_TIMER && tmp->expire_time_us && + tmp->expire_time_us > now_us) { if (!next_expire_time_us || next_expire_time_us > tmp->expire_time_us) { /* use time of the next expiring alarm/timer */ next_expire_time_us = tmp->expire_time_us; @@ -312,7 +319,7 @@ static int libos_async_worker(void* arg) { /* check if this event is an IO event found in async_list */ LISTP_FOR_EACH_ENTRY_SAFE(tmp, n, &async_list, list) { - if (tmp->object == pals[i]) { + if (tmp->type == ASYNC_EVENT_TYPE_IO && tmp->object == pals[i]) { log_debug("Async IO event triggered at %lu", now_us); LISTP_ADD_TAIL(tmp, &triggered, triggered_list); break; @@ -342,7 +349,7 @@ static int libos_async_worker(void* arg) { LISTP_FOR_EACH_ENTRY_SAFE(tmp, n, &triggered, triggered_list) { LISTP_DEL(tmp, &triggered, triggered_list); tmp->callback(tmp->caller, tmp->arg); - if (!tmp->object) { + if (tmp->type == ASYNC_EVENT_TYPE_ALARM_TIMER) { /* this is a one-off exit-child 
or alarm/timer event */ free(tmp); } diff --git a/libos/src/libos_parser.c b/libos/src/libos_parser.c index 883c07f2f0..37f906bc57 100644 --- a/libos/src/libos_parser.c +++ b/libos/src/libos_parser.c @@ -516,13 +516,17 @@ struct parser_table { parse_integer_arg, parse_pointer_arg, parse_integer_arg, parse_integer_arg, parse_pointer_arg, parse_pointer_arg}}, [__NR_signalfd] = {.slow = false, .name = "signalfd", .parser = {NULL}}, - [__NR_timerfd_create] = {.slow = false, .name = "timerfd_create", .parser = {NULL}}, + [__NR_timerfd_create] = {.slow = false, .name = "timerfd_create", .parser = {parse_long_arg, + parse_integer_arg, parse_integer_arg}}, [__NR_eventfd] = {.slow = false, .name = "eventfd", .parser = {parse_long_arg, parse_integer_arg}}, [__NR_fallocate] = {.slow = false, .name = "fallocate", .parser = {parse_long_arg, parse_integer_arg, parse_integer_arg, parse_long_arg, parse_long_arg}}, - [__NR_timerfd_settime] = {.slow = false, .name = "timerfd_settime", .parser = {NULL}}, - [__NR_timerfd_gettime] = {.slow = false, .name = "timerfd_gettime", .parser = {NULL}}, + [__NR_timerfd_settime] = {.slow = false, .name = "timerfd_settime", .parser = {parse_long_arg, + parse_integer_arg, parse_integer_arg, parse_pointer_arg, + parse_pointer_arg}}, + [__NR_timerfd_gettime] = {.slow = false, .name = "timerfd_gettime", .parser = {parse_long_arg, + parse_integer_arg, parse_pointer_arg}}, [__NR_accept4] = {.slow = true, .name = "accept4", .parser = {parse_long_arg, parse_integer_arg, parse_pointer_arg, parse_pointer_arg, parse_integer_arg}}, [__NR_signalfd4] = {.slow = false, .name = "signalfd4", .parser = {NULL}}, diff --git a/libos/src/meson.build b/libos/src/meson.build index b9946bc2af..9c87ab3083 100644 --- a/libos/src/meson.build +++ b/libos/src/meson.build @@ -43,6 +43,7 @@ libos_sources = files( 'fs/sys/cpu_info.c', 'fs/sys/fs.c', 'fs/sys/node_info.c', + 'fs/timerfd/fs.c', 'fs/tmpfs/fs.c', 'gramine_hash.c', 'ipc/libos_ipc.c', @@ -101,6 +102,7 @@ libos_sources = 
files( 'sys/libos_socket.c', 'sys/libos_stat.c', 'sys/libos_time.c', + 'sys/libos_timerfd.c', 'sys/libos_uname.c', 'sys/libos_wait.c', 'sys/libos_wrappers.c', diff --git a/libos/src/sys/libos_alarm.c b/libos/src/sys/libos_alarm.c index 0ccecfce25..a99b2adc8d 100644 --- a/libos/src/sys/libos_alarm.c +++ b/libos/src/sys/libos_alarm.c @@ -35,7 +35,8 @@ static void signal_alarm(IDTYPE caller, void* arg) { long libos_syscall_alarm(unsigned int seconds) { uint64_t usecs = 1000000ULL * seconds; - int64_t ret = install_async_event(NULL, usecs, &signal_alarm, NULL); + int64_t ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, /*object=*/NULL, + usecs, /*absolute_time=*/false, &signal_alarm, NULL); if (ret < 0) return ret; @@ -66,8 +67,9 @@ static void signal_itimer(IDTYPE caller, void* arg) { spinlock_unlock(&g_real_itimer_lock); if (next_reset) { - int64_t ret = install_async_event(/*object=*/NULL, next_reset, &signal_itimer, - /*arg=*/NULL); + int64_t ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, /*object=*/NULL, + next_reset, /*absolute_time=*/false, + &signal_itimer, /*arg=*/NULL); if (ret < 0) { log_error( "failed to re-enqueue the next timer event initially set up by 'setitimer()': %s", @@ -113,8 +115,9 @@ long libos_syscall_setitimer(int which, struct __kernel_itimerval* value, : 0; uint64_t current_reset = g_real_itimer.reset; - int64_t install_ret = install_async_event(NULL, next_value, &signal_itimer, /*arg=*/NULL); - + int64_t install_ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, /*object=*/NULL, + next_value, /*absolute_time=*/false, + &signal_itimer, /*arg=*/NULL); if (install_ret < 0) { spinlock_unlock(&g_real_itimer_lock); return install_ret; diff --git a/libos/src/sys/libos_epoll.c b/libos/src/sys/libos_epoll.c index b3e7a058fa..108c17caa0 100644 --- a/libos/src/sys/libos_epoll.c +++ b/libos/src/sys/libos_epoll.c @@ -189,6 +189,10 @@ void maybe_epoll_et_trigger(struct libos_handle* handle, int ret, bool in, bool needs_et = true; } 
break; + case TYPE_TIMERFD: + needs_et = true; + if (!in) + __atomic_store_n(&handle->needs_et_poll_in, true, __ATOMIC_RELEASE); default: /* Type unsupported with EPOLLET. */ break; @@ -461,6 +465,7 @@ long libos_syscall_epoll_ctl(int epfd, int op, int fd, struct epoll_event* event case TYPE_PIPE: case TYPE_SOCK: case TYPE_EVENTFD: + case TYPE_TIMERFD: break; default: /* epoll not supported by this type of handle */ diff --git a/libos/src/sys/libos_exit.c b/libos/src/sys/libos_exit.c index df21ec1168..2a34ef1062 100644 --- a/libos/src/sys/libos_exit.c +++ b/libos/src/sys/libos_exit.c @@ -108,7 +108,9 @@ noreturn void thread_exit(int error_code, int term_signal) { cur_thread->clear_child_tid_pal = 1; /* any non-zero value suffices */ /* We pass this ownership to `cleanup_thread`. */ get_thread(cur_thread); - int64_t ret = install_async_event(NULL, 0, &cleanup_thread, cur_thread); + int64_t ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, /*object=*/NULL, + /*time_us=*/0, /*absolute_time=*/false, &cleanup_thread, + cur_thread); /* Take the reference to the current thread from the tcb. 
*/ lock(&cur_thread->lock); diff --git a/libos/src/sys/libos_ioctl.c b/libos/src/sys/libos_ioctl.c index 89d5424da9..6974fef8da 100644 --- a/libos/src/sys/libos_ioctl.c +++ b/libos/src/sys/libos_ioctl.c @@ -104,7 +104,8 @@ long libos_syscall_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) { rwlock_write_unlock(&handle_map->lock); break; case FIOASYNC: - ret = install_async_event(hdl->pal_handle, 0, &signal_io, NULL); + ret = install_async_event(ASYNC_EVENT_TYPE_IO, hdl->pal_handle, + /*time_us=*/0, /*absolute_time=*/false, &signal_io, NULL); break; case FIONREAD: { if (!is_user_memory_writable((void*)arg, sizeof(int))) { diff --git a/libos/src/sys/libos_timerfd.c b/libos/src/sys/libos_timerfd.c new file mode 100644 index 0000000000..483c74694e --- /dev/null +++ b/libos/src/sys/libos_timerfd.c @@ -0,0 +1,279 @@ +/* SPDX-License-Identifier: LGPL-3.0-or-later */ +/* Copyright (C) 2024 Intel Corporation + * Kailun Qin + */ + +/* Implementation of "timerfd" system calls. + * + * The timerfd object is created inside Gramine, and all operations are resolved entirely inside + * Gramine. Each timerfd object is associated with a dummy eventfd created on the host. This is + * purely for triggering read notifications (e.g., in epoll); timerfd data is verified inside + * Gramine and is never exposed to the host. Since the host is used purely for notifications, a + * malicious host can only induce Denial of Service (DoS) attacks. + * + * The emulation is currently implemented at the level of a single process. The emulation *may* work + * for multi-process applications, e.g., if the child process inherits the timerfd object but + * doesn't use it. However, all timerfds created in the parent process are marked as invalid in + * child processes, i.e., inter-process timing signals via timerfds are not allowed. + * + * The host's eventfd object is "dummy" and used purely for notifications -- to unblock blocking + * read/select/poll/epoll system calls. 
The read notify logic is already hardened, by + * double-checking that the object was indeed updated. However, there are three possible attacks on + * polling mechanisms (select/poll/epoll): + * + * a. Malicious host may inject the notification too early: POLLIN when no timer expired yet. This + * may lead to a synchronization failure of the app. To prevent this, timerfd implements a + * callback `post_poll()` where it verifies that a timer was indeed expired (i.e., that the + * notification is not spurious). + * b. Malicious host may inject the notification too late or not send a notification at all. + * This is a Denial of Service (DoS), which we don't care about. + * c. Malicious host may inject POLLERR, POLLHUP, POLLRDHUP, POLLNVAL, POLLOUT. This is impossible + * as we control timerfd objects inside the LibOS, and we never raise such conditions. So the + * callback `post_poll()` panics if it detects such a return event. + */ + +#include "libos_checkpoint.h" +#include "libos_fs.h" +#include "libos_handle.h" +#include "libos_internal.h" +#include "libos_table.h" +#include "libos_utils.h" +#include "linux_abi/fs.h" +#include "linux_abi/time.h" +#include "linux_eventfd.h" +#include "pal.h" +#include "toml_utils.h" + +static void timerfd_dummy_host_write(struct libos_handle* hdl) { + int ret; + uint64_t buf_dummy_host_val = 1; + size_t dummy_host_val_count = sizeof(buf_dummy_host_val); + do { + ret = PalStreamWrite(hdl->pal_handle, /*offset=*/0, &dummy_host_val_count, + &buf_dummy_host_val); + } while (ret == -PAL_ERROR_INTERRUPTED); + if (ret < 0 || dummy_host_val_count != sizeof(buf_dummy_host_val)) { + /* must not happen in benign case, consider it an attack and panic */ + BUG(); + } +} + +static int create_timerfd_pal_handle(PAL_HANDLE* out_pal_handle) { + int ret; + + PAL_HANDLE hdl = NULL; + + ret = PalStreamOpen(URI_PREFIX_EVENTFD, PAL_ACCESS_RDWR, /*share_flags=*/0, + PAL_CREATE_IGNORED, /*options=*/0, &hdl); + if (ret < 0) { + log_error("timerfd: dummy 
host eventfd creation failure"); + return pal_to_unix_errno(ret); + } + + *out_pal_handle = hdl; + return 0; +} + +long libos_syscall_timerfd_create(int clockid, int flags) { + int ret; + + if ((flags & ~TFD_CREATE_FLAGS) || + (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME && + clockid != CLOCK_REALTIME_ALARM && clockid != CLOCK_BOOTTIME && + clockid != CLOCK_BOOTTIME_ALARM)) + return -EINVAL; + + if (clockid != CLOCK_REALTIME) { + if (FIRST_TIME()) { + log_warning("Unsupported clockid in 'timerfd_create()'; replaced by the system-wide " + "real-time clock."); + } + } + + struct libos_handle* hdl = get_new_handle(); + if (!hdl) + return -ENOMEM; + + hdl->type = TYPE_TIMERFD; + hdl->fs = &timerfd_builtin_fs; + hdl->flags = O_RDONLY | (flags & TFD_NONBLOCK ? O_NONBLOCK : 0); + hdl->acc_mode = MAY_READ; + hdl->info.timerfd.broken_in_child = false; + + ret = create_timerfd_pal_handle(&hdl->pal_handle); + if (ret < 0) + goto out; + + ret = set_new_fd_handle(hdl, flags & TFD_CLOEXEC ? FD_CLOEXEC : 0, NULL); +out: + put_handle(hdl); + return ret; +} + +static void timerfd_update(struct libos_handle* hdl) { + if (hdl->info.timerfd.broken_in_child) { + log_warning("Child process tried to access timerfd created by parent process. This is " + "disallowed in Gramine."); + die_or_inf_loop(); + } + + spinlock_lock(&hdl->info.timerfd.expiration_lock); + + /* When the expiration count overflows, the read will saturate at UINT64_MAX while the timer + * will continue to fire. 
*/ + if (hdl->info.timerfd.num_expirations < UINT64_MAX) { + hdl->info.timerfd.num_expirations++; + hdl->info.timerfd.dummy_host_val++; + + /* perform a write (not supposed to block) to send an event to reading/polling threads */ + timerfd_dummy_host_write(hdl); + } + + spinlock_unlock(&hdl->info.timerfd.expiration_lock); + + maybe_epoll_et_trigger(hdl, /*ret=*/0, /*in=*/false, /*unused was_partial=*/false); +} + +static void callback_itimer(IDTYPE caller, void* arg) { + __UNUSED(caller); + + struct libos_handle* hdl = (struct libos_handle*)arg; + + spinlock_lock(&hdl->info.timerfd.timer_lock); + hdl->info.timerfd.timeout += hdl->info.timerfd.reset; + uint64_t next_reset = hdl->info.timerfd.reset; + spinlock_unlock(&hdl->info.timerfd.timer_lock); + + if (next_reset) { + int64_t ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, hdl->pal_handle, + next_reset, /*absolute_time=*/false, + &callback_itimer, (void*)hdl); + if (ret < 0) { + log_error( + "failed to re-enqueue the next timer event initially set up by " + "'timerfd_settime()': %s", unix_strerror(ret)); + die_or_inf_loop(); + } + } + + timerfd_update(hdl); +} + +long libos_syscall_timerfd_settime(int fd, int flags, const struct __kernel_itimerspec* value, + struct __kernel_itimerspec* ovalue) { + int ret; + + struct libos_handle* hdl = get_fd_handle(fd, /*fd_flags=*/NULL, /*map=*/NULL); + if (!hdl) + return -EBADF; + + if (hdl->type != TYPE_TIMERFD) { + ret = -EINVAL; + goto out; + } + + if (!is_user_memory_readable(value, sizeof(*value))) { + ret = -EFAULT; + goto out; + } + if (ovalue && !is_user_memory_writable(ovalue, sizeof(*ovalue))) { + ret = -EFAULT; + goto out; + } + + if (flags & ~TFD_SETTIME_FLAGS) { + ret = -EINVAL; + goto out; + } + + uint64_t setup_time = 0; + ret = PalSystemTimeQuery(&setup_time); + if (ret < 0) { + ret = pal_to_unix_errno(ret); + goto out; + } + + uint64_t next_value = timespec_to_us(&value->it_value); + uint64_t next_reset = timespec_to_us(&value->it_interval); + + 
spinlock_lock(&hdl->info.timerfd.timer_lock); + + uint64_t current_timeout = hdl->info.timerfd.timeout > setup_time + ? hdl->info.timerfd.timeout - setup_time + : 0; + uint64_t current_reset = hdl->info.timerfd.reset; + + bool absolute_time = flags & TFD_TIMER_ABSTIME; + if (absolute_time) { + hdl->info.timerfd.timeout = next_value; + } else { + hdl->info.timerfd.timeout = setup_time + next_value; + } + hdl->info.timerfd.reset = next_reset; + + spinlock_unlock(&hdl->info.timerfd.timer_lock); + + /* it_value == 0 disarms the timer: install unconditionally (like setitimer() does) because + * time_us == 0 makes install_async_event() cancel the event pending on this handle */ + int64_t install_ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, hdl->pal_handle, + next_value, absolute_time, + &callback_itimer, (void*)hdl); + if (install_ret < 0) { + ret = install_ret; + goto out; + } + + if (ovalue) { + ovalue->it_interval.tv_sec = current_reset / TIME_US_IN_S; + ovalue->it_interval.tv_nsec = (current_reset % TIME_US_IN_S) * TIME_NS_IN_US; + ovalue->it_value.tv_sec = current_timeout / TIME_US_IN_S; + ovalue->it_value.tv_nsec = (current_timeout % TIME_US_IN_S) * TIME_NS_IN_US; + } + + ret = 0; +out: + put_handle(hdl); + return ret; +} + +long libos_syscall_timerfd_gettime(int fd, struct __kernel_itimerspec* value) { + int ret; + + struct libos_handle* hdl = get_fd_handle(fd, /*fd_flags=*/NULL, /*map=*/NULL); + if (!hdl) + return -EBADF; + + if (hdl->type != TYPE_TIMERFD) { + ret = -EINVAL; + goto out; + } + + if (!is_user_memory_writable(value, sizeof(*value))) { + ret = -EFAULT; + goto out; + } + + uint64_t setup_time = 0; + ret = PalSystemTimeQuery(&setup_time); + if (ret < 0) { + ret = pal_to_unix_errno(ret); + goto out; + } + + spinlock_lock(&hdl->info.timerfd.timer_lock); + uint64_t current_timeout = hdl->info.timerfd.timeout > setup_time + ? 
hdl->info.timerfd.timeout - setup_time + : 0; + uint64_t current_reset = hdl->info.timerfd.reset; + spinlock_unlock(&hdl->info.timerfd.timer_lock); + + value->it_interval.tv_sec = current_reset / TIME_US_IN_S; + value->it_interval.tv_nsec = (current_reset % TIME_US_IN_S) * TIME_NS_IN_US; + value->it_value.tv_sec = current_timeout / TIME_US_IN_S; + value->it_value.tv_nsec = (current_timeout % TIME_US_IN_S) * TIME_NS_IN_US; + + ret = 0; +out: + put_handle(hdl); + return ret; +} diff --git a/libos/test/ltp/ltp.cfg b/libos/test/ltp/ltp.cfg index 07ba977792..d1d845368b 100644 --- a/libos/test/ltp/ltp.cfg +++ b/libos/test/ltp/ltp.cfg @@ -2435,8 +2435,12 @@ skip = yes [timer_settime*] skip = yes -# no timerfd -[timerfd*] +# clocks other than `CLOCK_REALTIME` are not supported +[timerfd04] +skip = yes + +# relies on "/proc/sys/kernel/tainted" (see tst_taint.c:tst_taint_check) +[timerfd_settime02] skip = yes [times03] diff --git a/libos/test/regression/meson.build b/libos/test/regression/meson.build index 11c1f0156a..7a4745077f 100644 --- a/libos/test/regression/meson.build +++ b/libos/test/regression/meson.build @@ -151,6 +151,8 @@ tests = { 'tcp_einprogress': {}, 'tcp_ipv6_v6only': {}, 'tcp_msg_peek': {}, + 'timerfd': {}, + 'timerfd_fork': {}, 'udp': {}, 'uid_gid': {}, 'unix': {}, diff --git a/libos/test/regression/test_libos.py b/libos/test/regression/test_libos.py index 3f16eae33e..7daf005ee8 100644 --- a/libos/test/regression/test_libos.py +++ b/libos/test/regression/test_libos.py @@ -1017,6 +1017,18 @@ def test_150_itimer(self): stdout, _ = self.run_binary(['itimer']) self.assertIn("TEST OK", stdout) + def test_160_timerfd(self): + stdout, _ = self.run_binary(['timerfd'], timeout=120) + self.assertIn("TEST OK", stdout) + + def test_161_timerfd_fork(self): + try: + self.run_binary(['timerfd_fork']) + self.fail('timerfd_fork unexpectedly succeeded') + except subprocess.CalledProcessError as e: + stdout = e.stdout.decode() + self.assertIn('child died', stdout) + class 
TC_31_Syscall(RegressionTestCase): def test_000_syscall_redirect(self): stdout, _ = self.run_binary(['syscall']) diff --git a/libos/test/regression/tests.toml b/libos/test/regression/tests.toml index 1ed4da79f4..a1b1170f5a 100644 --- a/libos/test/regression/tests.toml +++ b/libos/test/regression/tests.toml @@ -129,6 +129,8 @@ manifests = [ "tcp_einprogress", "tcp_ipv6_v6only", "tcp_msg_peek", + "timerfd", + "timerfd_fork", "toml_parsing", "udp", "uid_gid", diff --git a/libos/test/regression/tests_musl.toml b/libos/test/regression/tests_musl.toml index d8cbfac653..d430bdbc92 100644 --- a/libos/test/regression/tests_musl.toml +++ b/libos/test/regression/tests_musl.toml @@ -130,6 +130,8 @@ manifests = [ "tcp_einprogress", "tcp_ipv6_v6only", "tcp_msg_peek", + "timerfd", + "timerfd_fork", "toml_parsing", "udp", "uid_gid", diff --git a/libos/test/regression/timerfd.c b/libos/test/regression/timerfd.c new file mode 100644 index 0000000000..842930ded0 --- /dev/null +++ b/libos/test/regression/timerfd.c @@ -0,0 +1,300 @@ +/* SPDX-License-Identifier: LGPL-3.0-or-later */ +/* Copyright (C) 2024 Intel Corporation + * Kailun Qin + */ + +/* + * Single-process test for `timerfd` syscalls (`timerfd_create()`, `timerfd_settime()` and + * `timerfd_gettime()`). + * + * The tests cover reading a blocking/non-blocking timerfd, waiting on timerfds via + * select/poll/epoll, arming a relative/absolute/periodic timerfd and reading a timerfd from multiple + * threads. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) + +#define EXPECTED_EXPIRATIONS 1 +#define EXPECTED_PERIODIC_TIMER_EXPIRATION_COUNT 5 +#define NUM_FDS 2 +#define NUM_THREADS 5 +#define PERIODIC_INTERVAL 1 +#define TIMEOUT_VALUE 2 + +static void set_timerfd_relative(int fd, bool periodic) { + struct itimerspec new_value = { + .it_value.tv_sec = TIMEOUT_VALUE, + .it_interval.tv_sec = periodic ? PERIODIC_INTERVAL : 0, + }; + + CHECK(timerfd_settime(fd, 0, &new_value, NULL)); +} + +static void set_timerfds_relative(int fds[NUM_FDS], bool periodic) { + for (int i = 0; i < NUM_FDS; i++) + set_timerfd_relative(fds[i], periodic); +} + +static void set_timerfd_absolute(int fd, struct timespec* abs_time) { + struct itimerspec new_value; + + /* Set the timer to expire at the absolute time specified */ + new_value.it_value.tv_sec = abs_time->tv_sec; + new_value.it_value.tv_nsec = abs_time->tv_nsec; + new_value.it_interval.tv_sec = 0; + new_value.it_interval.tv_nsec = 0; + + /* Set the timer to absolute time */ + CHECK(timerfd_settime(fd, TFD_TIMER_ABSTIME, &new_value, NULL)); +} + +static void create_timerfds(int fds[NUM_FDS]) { + for (int i = 0; i < NUM_FDS; i++) + fds[i] = CHECK(timerfd_create(CLOCK_REALTIME, 0)); +} + +static void close_timerfds(int fds[NUM_FDS]) { + for (int i = 0; i < NUM_FDS; i++) + CHECK(close(fds[i])); +} + +static void test_select(int fds[NUM_FDS]) { + fd_set rfds; + FD_ZERO(&rfds); + for (int i = 0; i < NUM_FDS; i++) { + FD_SET(fds[i], &rfds); + } + + int max_fd = MAX(fds[0], fds[1]) + 1; + CHECK(select(max_fd, &rfds, NULL, NULL, NULL)); + + for (int i = 0; i < NUM_FDS; i++) { + if (FD_ISSET(fds[i], &rfds)) { + uint64_t expirations; + CHECK(read(fds[i], &expirations, sizeof(expirations))); + if (expirations != 1) + errx(1, "select: unexpected number of expirations (expected 1, got %lu)", + expirations); + } + } +} + +static void test_poll(int fds[NUM_FDS]) { + struct pollfd pfds[NUM_FDS]; + for (int i = 0; i < NUM_FDS; i++) { + pfds[i].fd = fds[i]; + pfds[i].events = POLLIN; + } + + 
CHECK(poll(pfds, NUM_FDS, -1)); + + for (int i = 0; i < NUM_FDS; i++) { + if (pfds[i].revents & POLLIN) { + uint64_t expirations; + CHECK(read(fds[i], &expirations, sizeof(expirations))); + if (expirations != 1) + errx(1, "poll: unexpected number of expirations (expected 1, got %lu)", + expirations); + } + } +} + +static void test_epoll(int fds[NUM_FDS]) { + int epfd = CHECK(epoll_create1(0)); + + struct epoll_event ev; + ev.events = EPOLLIN; + for (int i = 0; i < NUM_FDS; i++) { + ev.data.fd = fds[i]; + CHECK(epoll_ctl(epfd, EPOLL_CTL_ADD, fds[i], &ev)); + } + + struct epoll_event events[NUM_FDS]; + int nfds = CHECK(epoll_wait(epfd, events, NUM_FDS, -1)); + + for (int n = 0; n < nfds; ++n) { + uint64_t expirations; + CHECK(read(events[n].data.fd, &expirations, sizeof(expirations))); + if (expirations != 1) + errx(1, "epoll: unexpected number of expirations (expected 1, got %lu)", expirations); + } + + close(epfd); +} + +static pthread_cond_t cond = PTHREAD_COND_INITIALIZER; +static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; +static size_t expiration_count = 0; + +static void* timerfd_read_thread_periodic_timer(void* arg) { + int fd = *(int*)arg; + uint64_t expirations; + + for (;;) { + CHECK(read(fd, &expirations, sizeof(expirations))); + pthread_mutex_lock(&mutex); + expiration_count += expirations; + pthread_cond_signal(&cond); + pthread_mutex_unlock(&mutex); + } + + return NULL; +} + +static void test_periodic_timer(int fd) { + pthread_t thread; + CHECK(pthread_create(&thread, NULL, timerfd_read_thread_periodic_timer, &fd)); + + /* wait for at least 5 expirations */ + pthread_mutex_lock(&mutex); + while (expiration_count < EXPECTED_PERIODIC_TIMER_EXPIRATION_COUNT) { + pthread_cond_wait(&cond, &mutex); + } + pthread_mutex_unlock(&mutex); + + if (expiration_count != EXPECTED_PERIODIC_TIMER_EXPIRATION_COUNT) + errx(1, "periodic_timer: unexpected number of expirations (expected 5, got %lu)", + expiration_count); + + /* cleanup: cancel the read thread and 
wait for it to exit */ + CHECK(pthread_cancel(thread)); + CHECK(pthread_join(thread, NULL)); +} + +static void* timerfd_read_thread(void* arg) { + int fd = *(int*)arg; + uint64_t expirations; + CHECK(read(fd, &expirations, sizeof(expirations))); + if (expirations == 0) + err(1, "threaded read: unexpected number of expirations"); + pthread_exit(NULL); +} + +static void test_threaded_read(int fd) { + pthread_t threads[NUM_THREADS]; + for (int i = 0; i < NUM_THREADS; i++) { + CHECK(pthread_create(&threads[i], NULL, timerfd_read_thread, &fd)); + /* wait for the thread to finish */ + CHECK(pthread_join(threads[i], NULL)); + } +} + +static void test_timerfd_gettime(int fd) { + struct itimerspec curr_value; + CHECK(timerfd_gettime(fd, &curr_value)); + + /* the timer should be set to expire close to 2 seconds */ + if (curr_value.it_value.tv_sec > 2 || curr_value.it_value.tv_sec < 1 || + curr_value.it_value.tv_nsec < 0 || curr_value.it_value.tv_nsec >= 1000000000) { + errx(1, "timerfd_gettime: unexpected timer value (expected close to 2.0, got %ld.%09ld)", + curr_value.it_value.tv_sec, curr_value.it_value.tv_nsec); + } +} + +static void test_absolute_time(int fd) { + struct timespec now; + struct timespec abs_time; + uint64_t expirations; + + /* test timerfd with absolute time set in the future */ + CHECK(clock_gettime(CLOCK_REALTIME, &now)); + abs_time.tv_sec = now.tv_sec + TIMEOUT_VALUE; + abs_time.tv_nsec = now.tv_nsec; + + set_timerfd_absolute(fd, &abs_time); + + CHECK(read(fd, &expirations, sizeof(expirations))); + if (expirations != EXPECTED_EXPIRATIONS) { + errx(1, "absolute_time future: unexpected number of expirations (expected %d, got %lu)", + EXPECTED_EXPIRATIONS, expirations); + } + + expirations = 0; + memset(&now, 0, sizeof(struct timespec)); + memset(&abs_time, 0, sizeof(struct timespec)); + + /* test timerfd with absolute time set in the past */ + CHECK(clock_gettime(CLOCK_REALTIME, &now)); + abs_time.tv_sec = now.tv_sec - TIMEOUT_VALUE; + abs_time.tv_nsec = 
now.tv_nsec; + + set_timerfd_absolute(fd, &abs_time); + + CHECK(read(fd, &expirations, sizeof(expirations))); + if (expirations != EXPECTED_EXPIRATIONS) { + errx(1, "absolute_time past: unexpected number of expirations (expected %d, got %lu)", + EXPECTED_EXPIRATIONS, expirations); + } +} + +static void test_read(int fd, bool non_blocking) { + if (non_blocking) { + CHECK(fcntl(fd, F_SETFL, O_NONBLOCK)); + } + + uint64_t expirations; + int retval = read(fd, &expirations, sizeof(expirations)); + + if (non_blocking) { + if (retval != -1 || errno != EAGAIN) { + errx(1, "non-blocking read: read returned %d, errno %d, expected -1 and EAGAIN", + retval, errno); + } + } else { + CHECK(retval); + if (expirations != EXPECTED_EXPIRATIONS) { + errx(1, "read: unexpected number of expirations (expected %d, got %lu)", + EXPECTED_EXPIRATIONS, expirations); + } + } +} + +int main(void) { + int fds[NUM_FDS]; + create_timerfds(fds); + + set_timerfds_relative(fds, /*periodic*/false); + test_select(fds); + + set_timerfds_relative(fds, /*periodic*/false); + test_poll(fds); + + set_timerfds_relative(fds, /*periodic*/false); + test_epoll(fds); + + set_timerfd_relative(fds[0], /*periodic*/true); + test_periodic_timer(fds[0]); + + set_timerfd_relative(fds[0], /*periodic*/true); + test_threaded_read(fds[0]); + + set_timerfd_relative(fds[0], /*periodic*/false); + test_timerfd_gettime(fds[0]); + + set_timerfd_relative(fds[0], /*periodic*/false); + test_read(fds[0], /*non_blocking=*/false); + test_read(fds[0], /*non_blocking=*/true); + + test_absolute_time(fds[1]); + + close_timerfds(fds); + + puts("TEST OK"); + return 0; +} diff --git a/libos/test/regression/timerfd_fork.c b/libos/test/regression/timerfd_fork.c new file mode 100644 index 0000000000..daa3874646 --- /dev/null +++ b/libos/test/regression/timerfd_fork.c @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: LGPL-3.0-or-later */ +/* Copyright (C) 2024 Intel Corporation + * Kailun Qin + */ + +/* Multi-process test for `timerfd` syscalls 
(`timerfd_create()`, `timerfd_settime()` and + * `timerfd_gettime()`). + * + * Note that timerfd is currently only emulated in a secure single-process mode, so this test is + * expected to fail: the child cannot use the timerfd inherited from the parent. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +#define EXPECTED_EXPIRATIONS 1 +#define TIMEOUT_VALUE 2 + +static void set_timerfd(int fd) { + struct itimerspec new_value = { .it_value.tv_sec = TIMEOUT_VALUE }; + + CHECK(timerfd_settime(fd, 0, &new_value, NULL)); +} + +static void test_multi_process(int fd) { + pid_t pid = CHECK(fork()); + if (pid == 0) { + uint64_t expirations; + /* child: wait on a blocking read for the timer to expire */ + CHECK(read(fd, &expirations, sizeof(expirations))); + if (expirations != EXPECTED_EXPIRATIONS) { + errx(1, "child process: unexpected number of expirations (expected %d, got %lu)", + EXPECTED_EXPIRATIONS, expirations); + } + exit(0); + } else { + int status = 0; + + /* parent: do nothing and let the child process read the timerfd */ + /* wait for the child process to exit */ + CHECK(waitpid(pid, &status, 0)); + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + errx(1, "child died with status: %#x", status); + } + } +} + +int main(void) { + int fd = CHECK(timerfd_create(CLOCK_REALTIME, 0)); + + set_timerfd(fd); + test_multi_process(fd); + + CHECK(close(fd)); + + puts("TEST OK"); + return 0; +}