diff --git a/Documentation/devel/features.md b/Documentation/devel/features.md
index 0d1434f8ef..fe61ea8367 100644
--- a/Documentation/devel/features.md
+++ b/Documentation/devel/features.md
@@ -1036,7 +1036,7 @@ The below list is generated from the [syscall table of Linux
- ☒ `signalfd()`
[7](#signals-and-process-state-changes)
-- ☒ `timerfd_create()`
+- ▣ `timerfd_create()`
[20](#sleeps-timers-and-alarms)
- ▣ `eventfd()`
@@ -1045,10 +1045,10 @@ The below list is generated from the [syscall table of Linux
- ▣ `fallocate()`
[9a](#file-system-operations)
-- ☒ `timerfd_settime()`
+- ▣ `timerfd_settime()`
[20](#sleeps-timers-and-alarms)
-- ☒ `timerfd_gettime()`
+- ▣ `timerfd_gettime()`
[20](#sleeps-timers-and-alarms)
- ☑ `accept4()`
@@ -2871,9 +2871,20 @@ Gramine implements getting and setting the interval timer: `getitimer()` and `se
Gramine implements alarm clocks via `alarm()`.
+Gramine implements timers that notify via file descriptors: `timerfd_create()`, `timerfd_settime()`
+and `timerfd_gettime()`. The timerfd object is created inside Gramine, and all operations are
+resolved entirely inside Gramine. Each timerfd object is associated with a dummy eventfd created on
+the host. This is purely for triggering read notifications (e.g., in epoll); timerfd data is
+verified inside Gramine and is never exposed to the host. Since the host is used purely for
+notifications, a malicious host can only induce Denial of Service (DoS) attacks.
+
+The emulation is currently implemented at the level of a single process. The emulation *may* work for
+multi-process applications, e.g., if the child process inherits the timerfd object but doesn't use
+it. However, all timerfds created in the parent process are marked as invalid in child processes,
+i.e., inter-process timing signals via timerfds are not allowed.
+
Gramine does *not* currently implement the POSIX per-process timer: `timer_create()`, etc. Gramine
-also does not currently implement timers that notify via file descriptors. Gramine could implement
-these timers in the future, if need arises.
+could implement it in the future, if need arises.
Related system calls
@@ -2889,9 +2900,9 @@ these timers in the future, if need arises.
- ☒ `timer_getoverrun()`: may be implemented in the future
- ☒ `timer_delete()`: may be implemented in the future
-- ☒ `timerfd_create()`: may be implemented in the future
-- ☒ `timerfd_settime()`: may be implemented in the future
-- ☒ `timerfd_gettime()`: may be implemented in the future
+- ▣ `timerfd_create()`: see notes above
+- ▣ `timerfd_settime()`: see notes above
+- ▣ `timerfd_gettime()`: see notes above
diff --git a/libos/include/libos_fs.h b/libos/include/libos_fs.h
index 4018c7c948..875225c50a 100644
--- a/libos/include/libos_fs.h
+++ b/libos/include/libos_fs.h
@@ -183,7 +183,7 @@ struct libos_fs_ops {
int (*poll)(struct libos_handle* hdl, int in_events, int* out_events);
/* Verify a single handle after poll. Must update `pal_ret_events` in-place with only allowed
- * ones. Used in e.g. secure eventfd FS to verify if the host is not lying to us. */
+ * ones. Used in e.g. secure eventfd and timerfd FS to verify if the host is not lying to us. */
void (*post_poll)(struct libos_handle* hdl, pal_wait_flags_t* pal_ret_events);
/* checkpoint/migrate the file system */
@@ -942,6 +942,7 @@ extern struct libos_fs eventfd_builtin_fs;
extern struct libos_fs synthetic_builtin_fs;
extern struct libos_fs path_builtin_fs;
extern struct libos_fs shm_builtin_fs;
+extern struct libos_fs timerfd_builtin_fs;
struct libos_fs* find_fs(const char* name);
diff --git a/libos/include/libos_handle.h b/libos/include/libos_handle.h
index 4ce281f4e0..615950a486 100644
--- a/libos/include/libos_handle.h
+++ b/libos/include/libos_handle.h
@@ -46,6 +46,7 @@ enum libos_handle_type {
/* Special handles: */
TYPE_EPOLL, /* epoll handles, see `libos_epoll.c` */
TYPE_EVENTFD, /* eventfd handles, used by `eventfd` filesystem */
+ TYPE_TIMERFD, /* timerfd handles, used by `timerfd` filesystem */
};
struct libos_pipe_handle {
@@ -142,6 +143,18 @@ struct libos_eventfd_handle {
uint64_t dummy_host_val;
};
+/* Per-handle state of an emulated timerfd (TYPE_TIMERFD). All times are in microseconds. */
+struct libos_timerfd_handle {
+ bool broken_in_child; /* set by the timerfd FS `checkin` callback; read/poll then fail */
+
+ spinlock_t expiration_lock; /* protecting below fields */
+ uint64_t num_expirations; /* expirations since last read; saturates at UINT64_MAX */
+ uint64_t dummy_host_val; /* bumped per write to the dummy host object, zeroed when drained */
+
+ spinlock_t timer_lock; /* protecting below fields */
+ uint64_t timeout; /* absolute time of the next expiration */
+ uint64_t reset; /* period re-armed after each expiration; 0 means one-shot */
+};
+
struct libos_handle {
enum libos_handle_type type;
bool is_dir;
@@ -217,6 +230,8 @@ struct libos_handle {
struct libos_epoll_handle epoll; /* TYPE_EPOLL */
struct libos_eventfd_handle eventfd; /* TYPE_EVENTFD */
+
+ struct libos_timerfd_handle timerfd; /* TYPE_TIMERFD */
} info;
struct libos_dir_handle dir_info;
@@ -232,7 +247,7 @@ struct libos_handle {
* `read`, `seek` but not `pread`). This lock should be taken *before* `libos_handle.lock` and
* `libos_inode.lock`. Must be used *only* via maybe_lock_pos_handle() and
* maybe_unlock_pos_handle(); these functions make sure that the lock is acquired only on those
- * handle types that are seekable (e.g. not on eventfds or pipes). */
+ * handle types that are seekable (e.g. not on eventfds, timerfds or pipes). */
struct libos_lock pos_lock;
};
diff --git a/libos/include/libos_table.h b/libos/include/libos_table.h
index 5e110aca58..4ee6df6fa4 100644
--- a/libos/include/libos_table.h
+++ b/libos/include/libos_table.h
@@ -207,3 +207,7 @@ long libos_syscall_getrandom(char* buf, size_t count, unsigned int flags);
long libos_syscall_mlock2(unsigned long start, size_t len, int flags);
long libos_syscall_sysinfo(struct sysinfo* info);
long libos_syscall_close_range(unsigned int first, unsigned int last, unsigned int flags);
+long libos_syscall_timerfd_create(int clockid, int flags);
+long libos_syscall_timerfd_settime(int fd, int flags, const struct __kernel_itimerspec* value,
+ struct __kernel_itimerspec* ovalue);
+long libos_syscall_timerfd_gettime(int fd, struct __kernel_itimerspec* value);
diff --git a/libos/include/libos_utils.h b/libos/include/libos_utils.h
index e3f4cb2e1f..f408d9bf35 100644
--- a/libos/include/libos_utils.h
+++ b/libos/include/libos_utils.h
@@ -52,8 +52,14 @@ void clean_link_map_list(void);
int create_pipe(char* name, char* uri, size_t size, PAL_HANDLE* hdl, bool use_vmid_for_name);
/* Asynchronous event support */
+/* Kind of event registered with `install_async_event()`. */
+enum async_event_type {
+ ASYNC_EVENT_TYPE_IO = 1, /* async IO on a PAL handle; must be installed with time_us == 0 */
+ ASYNC_EVENT_TYPE_ALARM_TIMER = 2, /* alarm/setitimer/timerfd timers; also thread cleanup */
+};
+
int init_async_worker(void);
-int64_t install_async_event(PAL_HANDLE object, unsigned long time,
+int64_t install_async_event(enum async_event_type type, PAL_HANDLE object,
+ unsigned long time_us, bool absolute_time,
void (*callback)(IDTYPE caller, void* arg), void* arg);
struct libos_thread* terminate_async_worker(void);
diff --git a/libos/include/linux_abi/time.h b/libos/include/linux_abi/time.h
index da848822de..f0e1f6d76d 100644
--- a/libos/include/linux_abi/time.h
+++ b/libos/include/linux_abi/time.h
@@ -37,3 +37,14 @@ struct __kernel_timezone {
int tz_minuteswest; /* minutes west of Greenwich */
int tz_dsttime; /* type of dst correction */
};
+
+/* timerfd_settime flag: `it_value` is an absolute time rather than relative to "now". */
+#define TFD_TIMER_ABSTIME (1 << 0)
+/* NOTE(review): accepted by the timerfd_settime flag mask, but no cancel-on-set handling is
+ * visible in the timerfd emulation -- confirm whether it is intentionally a no-op. */
+#define TFD_TIMER_CANCEL_ON_SET (1 << 1)
+/* timerfd_create flags; they share their values with the corresponding open() flags. */
+#define TFD_CLOEXEC O_CLOEXEC
+#define TFD_NONBLOCK O_NONBLOCK
+
+#define TFD_SHARED_FCNTL_FLAGS (TFD_CLOEXEC | TFD_NONBLOCK)
+/* Flags for timerfd_create. */
+#define TFD_CREATE_FLAGS TFD_SHARED_FCNTL_FLAGS
+/* Flags for timerfd_settime. */
+#define TFD_SETTIME_FLAGS (TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET)
diff --git a/libos/src/arch/x86_64/libos_table.c b/libos/src/arch/x86_64/libos_table.c
index 9629423899..0e28c50b5c 100644
--- a/libos/src/arch/x86_64/libos_table.c
+++ b/libos/src/arch/x86_64/libos_table.c
@@ -297,11 +297,11 @@ libos_syscall_t libos_syscall_table[LIBOS_SYSCALL_BOUND] = {
[__NR_utimensat] = (libos_syscall_t)0, // libos_syscall_utimensat
[__NR_epoll_pwait] = (libos_syscall_t)libos_syscall_epoll_pwait,
[__NR_signalfd] = (libos_syscall_t)0, // libos_syscall_signalfd
- [__NR_timerfd_create] = (libos_syscall_t)0, // libos_syscall_timerfd_create
+ [__NR_timerfd_create] = (libos_syscall_t)libos_syscall_timerfd_create,
[__NR_eventfd] = (libos_syscall_t)libos_syscall_eventfd,
[__NR_fallocate] = (libos_syscall_t)libos_syscall_fallocate,
- [__NR_timerfd_settime] = (libos_syscall_t)0, // libos_syscall_timerfd_settime
- [__NR_timerfd_gettime] = (libos_syscall_t)0, // libos_syscall_timerfd_gettime
+ [__NR_timerfd_settime] = (libos_syscall_t)libos_syscall_timerfd_settime,
+ [__NR_timerfd_gettime] = (libos_syscall_t)libos_syscall_timerfd_gettime,
[__NR_accept4] = (libos_syscall_t)libos_syscall_accept4,
[__NR_signalfd4] = (libos_syscall_t)0, // libos_syscall_signalfd4
[__NR_eventfd2] = (libos_syscall_t)libos_syscall_eventfd2,
diff --git a/libos/src/fs/libos_fs.c b/libos/src/fs/libos_fs.c
index 5a29a36d6d..f10aefd74b 100644
--- a/libos/src/fs/libos_fs.c
+++ b/libos/src/fs/libos_fs.c
@@ -33,6 +33,7 @@ static struct libos_fs* g_builtin_fs[] = {
&synthetic_builtin_fs,
&path_builtin_fs,
&shm_builtin_fs,
+ &timerfd_builtin_fs,
};
static struct libos_lock g_mount_mgr_lock;
diff --git a/libos/src/fs/proc/thread.c b/libos/src/fs/proc/thread.c
index c3da147c48..ed1fa1a95a 100644
--- a/libos/src/fs/proc/thread.c
+++ b/libos/src/fs/proc/thread.c
@@ -287,6 +287,7 @@ static char* describe_handle(struct libos_handle* hdl) {
case TYPE_EPOLL: str = "epoll:[?]"; break;
case TYPE_EVENTFD: str = "eventfd:[?]"; break;
case TYPE_SHM: str = "shm:[?]"; break;
+ case TYPE_TIMERFD: str = "timerfd:[?]"; break;
default: str = "unknown:[?]"; break;
}
return strdup(str);
diff --git a/libos/src/fs/timerfd/fs.c b/libos/src/fs/timerfd/fs.c
new file mode 100644
index 0000000000..dfa74b9e58
--- /dev/null
+++ b/libos/src/fs/timerfd/fs.c
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: LGPL-3.0-or-later */
+/* Copyright (C) 2024 Intel Corporation
+ * Kailun Qin
+ */
+
+/*
+ * This file contains code for implementation of "timerfd" filesystem. For more information, see
+ * `libos/src/sys/libos_timerfd.c`.
+ */
+
+#include "libos_fs.h"
+#include "libos_handle.h"
+#include "libos_internal.h"
+#include "libos_lock.h"
+#include "linux_abi/errors.h"
+#include "pal.h"
+
+/* Enforce a restriction that all timerfds created in the parent process are marked as invalid in
+ * child processes, i.e., inter-process timing signals via timerfds are not allowed. This
+ * restriction exists because LibOS doesn't yet implement sync between timerfd objects.
+ *
+ * Sets `broken_in_child`, which timerfd_read() and timerfd_post_poll() check before doing any
+ * work. Always returns 0. */
+static int timerfd_checkin(struct libos_handle* hdl) {
+ assert(hdl->type == TYPE_TIMERFD);
+ hdl->info.timerfd.broken_in_child = true;
+ return 0;
+}
+
+/* Reads one 8-byte value from the dummy host object backing `hdl`, retrying while the PAL call
+ * reports an interruption. The value itself is discarded; the read only serves to drain the host
+ * object. Any other failure or a short read panics via BUG(). */
+static void timerfd_dummy_host_read(struct libos_handle* hdl) {
+ int ret;
+ uint64_t buf_dummy_host_val = 0;
+ size_t dummy_host_val_count = sizeof(buf_dummy_host_val);
+ do {
+ ret = PalStreamRead(hdl->pal_handle, /*offset=*/0, &dummy_host_val_count,
+ &buf_dummy_host_val);
+ } while (ret == -PAL_ERROR_INTERRUPTED);
+ if (ret < 0 || dummy_host_val_count != sizeof(buf_dummy_host_val)) {
+ /* must not happen in benign case, consider it an attack and panic */
+ BUG();
+ }
+}
+
+/* Waits for a PAL_WAIT_READ event on the dummy host object backing `hdl`. An interrupted wait
+ * simply returns; any other error panics via BUG(). The events reported by the host are ignored,
+ * so the caller must re-check the in-LibOS timerfd state after this returns. */
+static void timerfd_dummy_host_wait(struct libos_handle* hdl) {
+ pal_wait_flags_t wait_for_events = PAL_WAIT_READ;
+ pal_wait_flags_t ret_events = 0;
+ int ret = PalStreamsWaitEvents(1, &hdl->pal_handle, &wait_for_events, &ret_events, NULL);
+ if (ret < 0 && ret != -PAL_ERROR_INTERRUPTED) {
+ BUG();
+ }
+ (void)ret_events; /* we don't care what events the host returned, we can't trust them anyway */
+}
+
+/* Read callback of the timerfd FS: stores the number of timer expirations recorded since the last
+ * successful read into `buf` and resets that counter to zero.
+ *
+ * - `buf`/`count`: destination buffer; must be able to hold a uint64_t, otherwise -EINVAL.
+ * - `pos`: unused (timerfds are not seekable).
+ *
+ * If no expiration is recorded yet, returns -EAGAIN on an O_NONBLOCK handle, otherwise blocks on
+ * the dummy host object until one is. Returns -EIO on a timerfd inherited from the parent process.
+ * On success returns sizeof(uint64_t), i.e. the number of bytes actually stored in `buf`, no matter
+ * how large `count` is (same as Linux). */
+static ssize_t timerfd_read(struct libos_handle* hdl, void* buf, size_t count, file_off_t* pos) {
+    __UNUSED(pos);
+    assert(hdl->type == TYPE_TIMERFD);
+
+    if (count < sizeof(uint64_t))
+        return -EINVAL;
+
+    if (hdl->info.timerfd.broken_in_child) {
+        log_warning("Child process tried to access timerfd created by parent process. This is "
+                    "disallowed in Gramine.");
+        return -EIO;
+    }
+
+    ssize_t ret;
+    spinlock_lock(&hdl->info.timerfd.expiration_lock);
+
+    while (!hdl->info.timerfd.num_expirations) {
+        if (hdl->flags & O_NONBLOCK) {
+            ret = -EAGAIN;
+            goto out;
+        }
+        /* do not wait on the host with the spinlock held; the loop re-checks the counter after
+         * wake-up because the host notification cannot be trusted */
+        spinlock_unlock(&hdl->info.timerfd.expiration_lock);
+        timerfd_dummy_host_wait(hdl);
+        spinlock_lock(&hdl->info.timerfd.expiration_lock);
+    }
+
+    memcpy(buf, &hdl->info.timerfd.num_expirations, sizeof(uint64_t));
+    hdl->info.timerfd.num_expirations = 0;
+
+    /* perform a read (not supposed to block) to drain the dummy host object, so that polling
+     * threads stop being notified about expirations that were just consumed */
+    if (hdl->info.timerfd.dummy_host_val) {
+        timerfd_dummy_host_read(hdl);
+        hdl->info.timerfd.dummy_host_val = 0;
+    }
+
+    /* exactly one uint64_t was stored, even if the caller offered a larger buffer */
+    ret = sizeof(uint64_t);
+out:
+    spinlock_unlock(&hdl->info.timerfd.expiration_lock);
+    maybe_epoll_et_trigger(hdl, (int)ret, /*in=*/true, /*unused was_partial=*/false);
+    return ret;
+}
+
+/* post_poll callback of the timerfd FS: filters, in place, the events in `*pal_ret_events` that
+ * were reported for the dummy host object of `hdl`.
+ *
+ * - On a timerfd inherited from the parent process, only PAL_WAIT_ERROR is reported.
+ * - Error, hang-up and write events panic via BUG().
+ * - PAL_WAIT_READ is kept only if at least one expiration is recorded inside LibOS. */
+static void timerfd_post_poll(struct libos_handle* hdl, pal_wait_flags_t* pal_ret_events) {
+ assert(hdl->type == TYPE_TIMERFD);
+
+ if (hdl->info.timerfd.broken_in_child) {
+ log_warning("Child process tried to access timerfd created by parent process. This is "
+ "disallowed in Gramine.");
+ *pal_ret_events = PAL_WAIT_ERROR;
+ return;
+ }
+
+ if (*pal_ret_events & (PAL_WAIT_ERROR | PAL_WAIT_HANG_UP | PAL_WAIT_WRITE)) {
+ /* impossible: we control timerfd inside the LibOS, and we never raise such conditions */
+ BUG();
+ }
+
+ spinlock_lock(&hdl->info.timerfd.expiration_lock);
+ if (*pal_ret_events & PAL_WAIT_READ) {
+ /* there is data to read: verify if timerfd has number of expirations greater than zero */
+ if (!hdl->info.timerfd.num_expirations) {
+ /* spurious or malicious notification, can legitimately happen if another thread
+ * consumed this event between this thread's poll wakeup and the post_poll callback;
+ * we currently choose to return a spurious notification to the user */
+ *pal_ret_events &= ~PAL_WAIT_READ;
+ }
+ }
+ spinlock_unlock(&hdl->info.timerfd.expiration_lock);
+}
+
+/* Callbacks implemented for timerfd handles; all other operations are left unset. */
+struct libos_fs_ops timerfd_fs_ops = {
+ .checkin = &timerfd_checkin,
+ .read = &timerfd_read,
+ .post_poll = &timerfd_post_poll,
+};
+
+/* The built-in "timerfd" filesystem; assigned to handles created by timerfd_create(). */
+struct libos_fs timerfd_builtin_fs = {
+ .name = "timerfd",
+ .fs_ops = &timerfd_fs_ops,
+};
diff --git a/libos/src/libos_async.c b/libos/src/libos_async.c
index b1c0b66db2..988f45364a 100644
--- a/libos/src/libos_async.c
+++ b/libos/src/libos_async.c
@@ -18,12 +18,13 @@
DEFINE_LIST(async_event);
struct async_event {
+ enum async_event_type type;
IDTYPE caller; /* thread installing this event */
LIST_TYPE(async_event) list;
LIST_TYPE(async_event) triggered_list;
void (*callback)(IDTYPE caller, void* arg);
void* arg;
- PAL_HANDLE object; /* handle (async IO) to wait on */
+ PAL_HANDLE object; /* handle (async IO or timerfd) to wait on */
uint64_t expire_time_us; /* alarm/timer to wait on */
};
DEFINE_LISTP(async_event);
@@ -40,25 +41,29 @@ static struct libos_pollable_event install_new_event;
static int create_async_worker(void);
-/* Threads register async events like alarm(), setitimer(), ioctl(FIOASYNC)
- * using this function. These events are enqueued in async_list and delivered
- * to async worker thread by triggering install_new_event. When event is
- * triggered in async worker thread, the corresponding event's callback with
- * arguments `arg` is called. This callback typically sends a signal to the
+/* Threads register async events like alarm(), setitimer(), timerfd_settime(), ioctl(FIOASYNC) using
+ * this function. These events are enqueued in async_list and delivered to async worker thread by
+ * triggering install_new_event. When event is triggered in async worker thread, the corresponding
+ * event's callback with arguments `arg` is called. This callback typically sends a signal to the
* thread which registered the event (saved in `event->caller`).
*
* We distinguish between alarm/timer events and async IO events:
+ * - alarm/timer events set time_us = microseconds (time_us = 0 cancels all pending
+ * alarms/timers).
+ * Specifically, when object != NULL and time_us != 0, this indicates a timerfd event.
* - alarm/timer events set object = NULL and time_us = microseconds
* (time_us = 0 cancels all pending alarms/timers).
* - async IO events set object = handle and time_us = 0.
*
- * Function returns remaining usecs for alarm/timer events (same as alarm())
- * or 0 for async IO events. On error, it returns a negated error code.
+ * Function returns remaining usecs for alarm/timer events (same as alarm()) or 0 for async IO
+ * events. On error, it returns a negated error code.
*/
-int64_t install_async_event(PAL_HANDLE object, uint64_t time_us,
+int64_t install_async_event(enum async_event_type type, PAL_HANDLE object,
+ uint64_t time_us, bool absolute_time,
void (*callback)(IDTYPE caller, void* arg), void* arg) {
- /* if event happens on object, time_us must be zero */
- assert(!object || (object && !time_us));
+ /* if event happens on async IO object, time_us must be zero */
+ assert(type == ASYNC_EVENT_TYPE_ALARM_TIMER ||
+ (type == ASYNC_EVENT_TYPE_IO && object && !time_us));
uint64_t now_us = 0;
int ret = PalSystemTimeQuery(&now_us);
@@ -73,21 +78,22 @@ int64_t install_async_event(PAL_HANDLE object, uint64_t time_us,
return -ENOMEM;
}
+ event->type = type;
event->callback = callback;
event->arg = arg;
event->caller = get_cur_tid();
event->object = object;
- event->expire_time_us = time_us ? now_us + time_us : 0;
+ event->expire_time_us = time_us ? (absolute_time ? time_us : now_us + time_us) : 0;
lock(&async_worker_lock);
- if (callback != &cleanup_thread && !object) {
- /* This is alarm() or setitimer() emulation, treat both according to
- * alarm() syscall semantics: cancel any pending alarm/timer. */
+ if (callback != &cleanup_thread && type == ASYNC_EVENT_TYPE_ALARM_TIMER) {
+ /* This is alarm(), setitimer(), timerfd_settime() emulation, treat all according to alarm()
+ * syscall semantics: cancel any pending alarm/timer. */
struct async_event* tmp;
struct async_event* n;
LISTP_FOR_EACH_ENTRY_SAFE(tmp, n, &async_list, list) {
- if (tmp->expire_time_us) {
+ if (tmp->object == object && tmp->expire_time_us) {
/* this is a pending alarm/timer, cancel it and save its expiration time */
if (max_prev_expire_time_us < tmp->expire_time_us)
max_prev_expire_time_us = tmp->expire_time_us;
@@ -208,7 +214,7 @@ static int libos_async_worker(void* arg) {
bool other_event = false;
LISTP_FOR_EACH_ENTRY_SAFE(tmp, n, &async_list, list) {
/* repopulate `pals` with IO events and find the next expiring alarm/timer */
- if (tmp->object) {
+ if (tmp->type == ASYNC_EVENT_TYPE_IO) {
if (pals_cnt == pals_max_cnt) {
/* grow `pals` to accommodate more objects */
PAL_HANDLE* tmp_pals = malloc(sizeof(*tmp_pals) * (1 + pals_max_cnt * 2));
@@ -244,7 +250,8 @@ static int libos_async_worker(void* arg) {
pal_events[pals_cnt + 1] = PAL_WAIT_READ;
ret_events[pals_cnt + 1] = 0;
pals_cnt++;
- } else if (tmp->expire_time_us && tmp->expire_time_us > now_us) {
+ } else if (tmp->type == ASYNC_EVENT_TYPE_ALARM_TIMER && tmp->expire_time_us &&
+ tmp->expire_time_us > now_us) {
if (!next_expire_time_us || next_expire_time_us > tmp->expire_time_us) {
/* use time of the next expiring alarm/timer */
next_expire_time_us = tmp->expire_time_us;
@@ -312,7 +319,7 @@ static int libos_async_worker(void* arg) {
/* check if this event is an IO event found in async_list */
LISTP_FOR_EACH_ENTRY_SAFE(tmp, n, &async_list, list) {
- if (tmp->object == pals[i]) {
+ if (tmp->type == ASYNC_EVENT_TYPE_IO && tmp->object == pals[i]) {
log_debug("Async IO event triggered at %lu", now_us);
LISTP_ADD_TAIL(tmp, &triggered, triggered_list);
break;
@@ -342,7 +349,7 @@ static int libos_async_worker(void* arg) {
LISTP_FOR_EACH_ENTRY_SAFE(tmp, n, &triggered, triggered_list) {
LISTP_DEL(tmp, &triggered, triggered_list);
tmp->callback(tmp->caller, tmp->arg);
- if (!tmp->object) {
+ if (tmp->type == ASYNC_EVENT_TYPE_ALARM_TIMER) {
/* this is a one-off exit-child or alarm/timer event */
free(tmp);
}
diff --git a/libos/src/libos_parser.c b/libos/src/libos_parser.c
index 883c07f2f0..37f906bc57 100644
--- a/libos/src/libos_parser.c
+++ b/libos/src/libos_parser.c
@@ -516,13 +516,17 @@ struct parser_table {
parse_integer_arg, parse_pointer_arg, parse_integer_arg,
parse_integer_arg, parse_pointer_arg, parse_pointer_arg}},
[__NR_signalfd] = {.slow = false, .name = "signalfd", .parser = {NULL}},
- [__NR_timerfd_create] = {.slow = false, .name = "timerfd_create", .parser = {NULL}},
+ [__NR_timerfd_create] = {.slow = false, .name = "timerfd_create", .parser = {parse_long_arg,
+ parse_integer_arg, parse_integer_arg}},
[__NR_eventfd] = {.slow = false, .name = "eventfd", .parser = {parse_long_arg,
parse_integer_arg}},
[__NR_fallocate] = {.slow = false, .name = "fallocate", .parser = {parse_long_arg,
parse_integer_arg, parse_integer_arg, parse_long_arg, parse_long_arg}},
- [__NR_timerfd_settime] = {.slow = false, .name = "timerfd_settime", .parser = {NULL}},
- [__NR_timerfd_gettime] = {.slow = false, .name = "timerfd_gettime", .parser = {NULL}},
+ [__NR_timerfd_settime] = {.slow = false, .name = "timerfd_settime", .parser = {parse_long_arg,
+ parse_integer_arg, parse_integer_arg, parse_pointer_arg,
+ parse_pointer_arg}},
+ [__NR_timerfd_gettime] = {.slow = false, .name = "timerfd_gettime", .parser = {parse_long_arg,
+ parse_integer_arg, parse_pointer_arg}},
[__NR_accept4] = {.slow = true, .name = "accept4", .parser = {parse_long_arg, parse_integer_arg,
parse_pointer_arg, parse_pointer_arg, parse_integer_arg}},
[__NR_signalfd4] = {.slow = false, .name = "signalfd4", .parser = {NULL}},
diff --git a/libos/src/meson.build b/libos/src/meson.build
index b9946bc2af..9c87ab3083 100644
--- a/libos/src/meson.build
+++ b/libos/src/meson.build
@@ -43,6 +43,7 @@ libos_sources = files(
'fs/sys/cpu_info.c',
'fs/sys/fs.c',
'fs/sys/node_info.c',
+ 'fs/timerfd/fs.c',
'fs/tmpfs/fs.c',
'gramine_hash.c',
'ipc/libos_ipc.c',
@@ -101,6 +102,7 @@ libos_sources = files(
'sys/libos_socket.c',
'sys/libos_stat.c',
'sys/libos_time.c',
+ 'sys/libos_timerfd.c',
'sys/libos_uname.c',
'sys/libos_wait.c',
'sys/libos_wrappers.c',
diff --git a/libos/src/sys/libos_alarm.c b/libos/src/sys/libos_alarm.c
index 0ccecfce25..a99b2adc8d 100644
--- a/libos/src/sys/libos_alarm.c
+++ b/libos/src/sys/libos_alarm.c
@@ -35,7 +35,8 @@ static void signal_alarm(IDTYPE caller, void* arg) {
long libos_syscall_alarm(unsigned int seconds) {
uint64_t usecs = 1000000ULL * seconds;
- int64_t ret = install_async_event(NULL, usecs, &signal_alarm, NULL);
+ int64_t ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, /*object=*/NULL,
+ usecs, /*absolute_time=*/false, &signal_alarm, NULL);
if (ret < 0)
return ret;
@@ -66,8 +67,9 @@ static void signal_itimer(IDTYPE caller, void* arg) {
spinlock_unlock(&g_real_itimer_lock);
if (next_reset) {
- int64_t ret = install_async_event(/*object=*/NULL, next_reset, &signal_itimer,
- /*arg=*/NULL);
+ int64_t ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, /*object=*/NULL,
+ next_reset, /*absolute_time=*/false,
+ &signal_itimer, /*arg=*/NULL);
if (ret < 0) {
log_error(
"failed to re-enqueue the next timer event initially set up by 'setitimer()': %s",
@@ -113,8 +115,9 @@ long libos_syscall_setitimer(int which, struct __kernel_itimerval* value,
: 0;
uint64_t current_reset = g_real_itimer.reset;
- int64_t install_ret = install_async_event(NULL, next_value, &signal_itimer, /*arg=*/NULL);
-
+ int64_t install_ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, /*object=*/NULL,
+ next_value, /*absolute_time=*/false,
+ &signal_itimer, /*arg=*/NULL);
if (install_ret < 0) {
spinlock_unlock(&g_real_itimer_lock);
return install_ret;
diff --git a/libos/src/sys/libos_epoll.c b/libos/src/sys/libos_epoll.c
index b3e7a058fa..108c17caa0 100644
--- a/libos/src/sys/libos_epoll.c
+++ b/libos/src/sys/libos_epoll.c
@@ -189,6 +189,10 @@ void maybe_epoll_et_trigger(struct libos_handle* handle, int ret, bool in, bool
needs_et = true;
}
break;
+        case TYPE_TIMERFD:
+            needs_et = true;
+            /* `in == false` is how the timerfd emulation reports a timer expiration; presumably
+             * this flag makes edge-triggered waiters poll the handle for input again */
+            if (!in)
+                __atomic_store_n(&handle->needs_et_poll_in, true, __ATOMIC_RELEASE);
+            /* explicit break: do not rely on falling through into `default` */
+            break;
default:
/* Type unsupported with EPOLLET. */
break;
@@ -461,6 +465,7 @@ long libos_syscall_epoll_ctl(int epfd, int op, int fd, struct epoll_event* event
case TYPE_PIPE:
case TYPE_SOCK:
case TYPE_EVENTFD:
+ case TYPE_TIMERFD:
break;
default:
/* epoll not supported by this type of handle */
diff --git a/libos/src/sys/libos_exit.c b/libos/src/sys/libos_exit.c
index df21ec1168..2a34ef1062 100644
--- a/libos/src/sys/libos_exit.c
+++ b/libos/src/sys/libos_exit.c
@@ -108,7 +108,9 @@ noreturn void thread_exit(int error_code, int term_signal) {
cur_thread->clear_child_tid_pal = 1; /* any non-zero value suffices */
/* We pass this ownership to `cleanup_thread`. */
get_thread(cur_thread);
- int64_t ret = install_async_event(NULL, 0, &cleanup_thread, cur_thread);
+ int64_t ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, /*object=*/NULL,
+ /*time_us=*/0, /*absolute_time=*/false, &cleanup_thread,
+ cur_thread);
/* Take the reference to the current thread from the tcb. */
lock(&cur_thread->lock);
diff --git a/libos/src/sys/libos_ioctl.c b/libos/src/sys/libos_ioctl.c
index 89d5424da9..6974fef8da 100644
--- a/libos/src/sys/libos_ioctl.c
+++ b/libos/src/sys/libos_ioctl.c
@@ -104,7 +104,8 @@ long libos_syscall_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) {
rwlock_write_unlock(&handle_map->lock);
break;
case FIOASYNC:
- ret = install_async_event(hdl->pal_handle, 0, &signal_io, NULL);
+ ret = install_async_event(ASYNC_EVENT_TYPE_IO, hdl->pal_handle,
+ /*time_us=*/0, /*absolute_time=*/false, &signal_io, NULL);
break;
case FIONREAD: {
if (!is_user_memory_writable((void*)arg, sizeof(int))) {
diff --git a/libos/src/sys/libos_timerfd.c b/libos/src/sys/libos_timerfd.c
new file mode 100644
index 0000000000..483c74694e
--- /dev/null
+++ b/libos/src/sys/libos_timerfd.c
@@ -0,0 +1,279 @@
+/* SPDX-License-Identifier: LGPL-3.0-or-later */
+/* Copyright (C) 2024 Intel Corporation
+ * Kailun Qin
+ */
+
+/* Implementation of "timerfd" system calls.
+ *
+ * The timerfd object is created inside Gramine, and all operations are resolved entirely inside
+ * Gramine. Each timerfd object is associated with a dummy eventfd created on the host. This is
+ * purely for triggering read notifications (e.g., in epoll); timerfd data is verified inside
+ * Gramine and is never exposed to the host. Since the host is used purely for notifications, a
+ * malicious host can only induce Denial of Service (DoS) attacks.
+ *
+ * The emulation is currently implemented at the level of a single process. The emulation *may* work
+ * for multi-process applications, e.g., if the child process inherits the timerfd object but
+ * doesn't use it. However, all timerfds created in the parent process are marked as invalid in
+ * child processes, i.e., inter-process timing signals via timerfds are not allowed.
+ *
+ * The host's eventfd object is "dummy" and used purely for notifications -- to unblock blocking
+ * read/select/poll/epoll system calls. The read notify logic is already hardened, by
+ * double-checking that the object was indeed updated. However, there are three possible attacks on
+ * polling mechanisms (select/poll/epoll):
+ *
+ * a. Malicious host may inject the notification too early: POLLIN when no timer expired yet. This
+ * may lead to a synchronization failure of the app. To prevent this, timerfd implements a
+ * callback `post_poll()` where it verifies that a timer was indeed expired (i.e., that the
+ * notification is not spurious).
+ * b. Malicious host may inject the notification too late or not send a notification at all.
+ * This is a Denial of Service (DoS), which we don't care about.
+ * c. Malicious host may inject POLLERR, POLLHUP, POLLRDHUP, POLLNVAL, POLLOUT. This is impossible
+ * as we control timerfd objects inside the LibOS, and we never raise such conditions. So the
+ * callback `post_poll()` panics if it detects such a return event.
+ */
+
+#include "libos_checkpoint.h"
+#include "libos_fs.h"
+#include "libos_handle.h"
+#include "libos_internal.h"
+#include "libos_table.h"
+#include "libos_utils.h"
+#include "linux_abi/fs.h"
+#include "linux_abi/time.h"
+#include "linux_eventfd.h"
+#include "pal.h"
+#include "toml_utils.h"
+
+/* Writes the 8-byte value 1 to the dummy host object backing `hdl`, retrying while the PAL call
+ * reports an interruption. This is what wakes up threads blocked in read/poll on the timerfd.
+ * Any other failure or a short write panics via BUG(). */
+static void timerfd_dummy_host_write(struct libos_handle* hdl) {
+ int ret;
+ uint64_t buf_dummy_host_val = 1;
+ size_t dummy_host_val_count = sizeof(buf_dummy_host_val);
+ do {
+ ret = PalStreamWrite(hdl->pal_handle, /*offset=*/0, &dummy_host_val_count,
+ &buf_dummy_host_val);
+ } while (ret == -PAL_ERROR_INTERRUPTED);
+ if (ret < 0 || dummy_host_val_count != sizeof(buf_dummy_host_val)) {
+ /* must not happen in benign case, consider it an attack and panic */
+ BUG();
+ }
+}
+
+/* Opens the dummy host eventfd that backs a timerfd and stores it in `*out_pal_handle`.
+ * Returns 0 on success; on failure logs an error, leaves `*out_pal_handle` untouched and returns
+ * the PAL error converted with pal_to_unix_errno(). */
+static int create_timerfd_pal_handle(PAL_HANDLE* out_pal_handle) {
+ int ret;
+
+ PAL_HANDLE hdl = NULL;
+
+ ret = PalStreamOpen(URI_PREFIX_EVENTFD, PAL_ACCESS_RDWR, /*share_flags=*/0,
+ PAL_CREATE_IGNORED, /*options=*/0, &hdl);
+ if (ret < 0) {
+ log_error("timerfd: dummy host eventfd creation failure");
+ return pal_to_unix_errno(ret);
+ }
+
+ *out_pal_handle = hdl;
+ return 0;
+}
+
+/* Emulates timerfd_create(2): creates a new timerfd handle backed by a dummy host eventfd.
+ *
+ * - `clockid`: must be one of the clocks listed below, else -EINVAL. Every accepted clock other
+ * than CLOCK_REALTIME only triggers a one-time warning; the handle is created the same way.
+ * - `flags`: subset of TFD_CREATE_FLAGS, else -EINVAL. TFD_NONBLOCK is recorded as O_NONBLOCK in
+ * the handle flags, TFD_CLOEXEC as FD_CLOEXEC on the new descriptor.
+ *
+ * Returns the result of set_new_fd_handle() (presumably the new fd number -- confirm), -ENOMEM if
+ * no handle could be allocated, or the error from creating the dummy host eventfd. */
+long libos_syscall_timerfd_create(int clockid, int flags) {
+ int ret;
+
+ if ((flags & ~TFD_CREATE_FLAGS) ||
+ (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME &&
+ clockid != CLOCK_REALTIME_ALARM && clockid != CLOCK_BOOTTIME &&
+ clockid != CLOCK_BOOTTIME_ALARM))
+ return -EINVAL;
+
+ if (clockid != CLOCK_REALTIME) {
+ if (FIRST_TIME()) {
+ log_warning("Unsupported clockid in 'timerfd_create()'; replaced by the system-wide "
+ "real-time clock.");
+ }
+ }
+
+ struct libos_handle* hdl = get_new_handle();
+ if (!hdl)
+ return -ENOMEM;
+
+ /* timerfds are read-only from the application's point of view */
+ hdl->type = TYPE_TIMERFD;
+ hdl->fs = &timerfd_builtin_fs;
+ hdl->flags = O_RDONLY | (flags & TFD_NONBLOCK ? O_NONBLOCK : 0);
+ hdl->acc_mode = MAY_READ;
+ hdl->info.timerfd.broken_in_child = false;
+
+ ret = create_timerfd_pal_handle(&hdl->pal_handle);
+ if (ret < 0)
+ goto out;
+
+ ret = set_new_fd_handle(hdl, flags & TFD_CLOEXEC ? FD_CLOEXEC : 0, NULL);
+out:
+ /* drop the reference obtained from get_new_handle() on both the success and error paths */
+ put_handle(hdl);
+ return ret;
+}
+
+/* Records one timer expiration on `hdl`: bumps the expiration counter, writes to the dummy host
+ * object to wake up reading/polling threads, and notifies edge-triggered epoll. Called from
+ * callback_itimer(). Terminates via die_or_inf_loop() if `hdl` was marked broken-in-child. */
+static void timerfd_update(struct libos_handle* hdl) {
+ if (hdl->info.timerfd.broken_in_child) {
+ log_warning("Child process tried to access timerfd created by parent process. This is "
+ "disallowed in Gramine.");
+ die_or_inf_loop();
+ }
+
+ spinlock_lock(&hdl->info.timerfd.expiration_lock);
+
+ /* When the expiration count overflows, the read will saturate at UINT64_MAX while the timer
+ * will continue to fire. */
+ if (hdl->info.timerfd.num_expirations < UINT64_MAX) {
+ hdl->info.timerfd.num_expirations++;
+ hdl->info.timerfd.dummy_host_val++;
+
+ /* perform a write (not supposed to block) to send an event to reading/polling threads */
+ timerfd_dummy_host_write(hdl);
+ }
+
+ spinlock_unlock(&hdl->info.timerfd.expiration_lock);
+
+ /* `in=false`: this is the producer side of the timerfd (new data became available) */
+ maybe_epoll_et_trigger(hdl, /*ret=*/0, /*in=*/false, /*unused was_partial=*/false);
+}
+
+/* Async-event callback installed by timerfd_settime(): runs when the timer of the timerfd passed
+ * in `arg` expires. Advances the stored timeout by the period, re-installs the event if the timer
+ * is periodic (non-zero `reset`), and records the expiration via timerfd_update(). `caller` is
+ * unused. Terminates via die_or_inf_loop() if the event cannot be re-installed.
+ *
+ * NOTE(review): `arg` is a raw handle pointer; no reference is taken on it in the code visible
+ * here -- confirm the handle cannot be freed while an event is still pending. */
+static void callback_itimer(IDTYPE caller, void* arg) {
+ __UNUSED(caller);
+
+ struct libos_handle* hdl = (struct libos_handle*)arg;
+
+ spinlock_lock(&hdl->info.timerfd.timer_lock);
+ hdl->info.timerfd.timeout += hdl->info.timerfd.reset;
+ uint64_t next_reset = hdl->info.timerfd.reset;
+ spinlock_unlock(&hdl->info.timerfd.timer_lock);
+
+ if (next_reset) {
+ /* periodic timer: the next expiration is scheduled relative to the current time */
+ int64_t ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, hdl->pal_handle,
+ next_reset, /*absolute_time=*/false,
+ &callback_itimer, (void*)hdl);
+ if (ret < 0) {
+ log_error(
+ "failed to re-enqueue the next timer event initially set up by "
+ "'timerfd_settime()': %s", unix_strerror(ret));
+ die_or_inf_loop();
+ }
+ }
+
+ timerfd_update(hdl);
+}
+
+/* Emulates timerfd_settime(2): arms or disarms the timer of the timerfd `fd`.
+ *
+ * - `flags`: subset of TFD_SETTIME_FLAGS. With TFD_TIMER_ABSTIME, `value->it_value` is an absolute
+ *   time; otherwise it is relative to the current time.
+ * - `value`: new initial expiration (`it_value`) and period (`it_interval`, zero for a one-shot
+ *   timer). A zero `it_value` disarms the timer.
+ * - `ovalue`: if not NULL, receives the previous setting (time left until the next expiration and
+ *   the previous period).
+ *
+ * Returns 0 on success; -EBADF if `fd` is not open; -EINVAL if `fd` is not a timerfd, `flags` has
+ * unknown bits or `value` holds a malformed timespec; -EFAULT if a user buffer is inaccessible; or
+ * the error of the time query / of install_async_event(). */
+long libos_syscall_timerfd_settime(int fd, int flags, const struct __kernel_itimerspec* value,
+                                   struct __kernel_itimerspec* ovalue) {
+    int ret;
+
+    struct libos_handle* hdl = get_fd_handle(fd, /*fd_flags=*/NULL, /*map=*/NULL);
+    if (!hdl)
+        return -EBADF;
+
+    if (hdl->type != TYPE_TIMERFD) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    if (!is_user_memory_readable(value, sizeof(*value))) {
+        ret = -EFAULT;
+        goto out;
+    }
+    if (ovalue && !is_user_memory_writable(ovalue, sizeof(*ovalue))) {
+        ret = -EFAULT;
+        goto out;
+    }
+
+    if (flags & ~TFD_SETTIME_FLAGS) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    /* like Linux, reject negative times and nanoseconds outside of [0, 999999999] before they get
+     * converted to (unsigned) microseconds */
+    const int64_t ns_in_s = (int64_t)TIME_US_IN_S * (int64_t)TIME_NS_IN_US;
+    if (value->it_value.tv_sec < 0 || value->it_value.tv_nsec < 0 ||
+            value->it_value.tv_nsec >= ns_in_s ||
+            value->it_interval.tv_sec < 0 || value->it_interval.tv_nsec < 0 ||
+            value->it_interval.tv_nsec >= ns_in_s) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    uint64_t setup_time = 0;
+    ret = PalSystemTimeQuery(&setup_time);
+    if (ret < 0) {
+        ret = pal_to_unix_errno(ret);
+        goto out;
+    }
+
+    uint64_t next_value = timespec_to_us(&value->it_value);
+    uint64_t next_reset = timespec_to_us(&value->it_interval);
+
+    spinlock_lock(&hdl->info.timerfd.timer_lock);
+
+    /* snapshot the previous setting (for `ovalue`) before overwriting it */
+    uint64_t current_timeout = hdl->info.timerfd.timeout > setup_time
+                                   ? hdl->info.timerfd.timeout - setup_time
+                                   : 0;
+    uint64_t current_reset = hdl->info.timerfd.reset;
+
+    bool absolute_time = flags & TFD_TIMER_ABSTIME;
+    if (absolute_time) {
+        hdl->info.timerfd.timeout = next_value;
+    } else {
+        hdl->info.timerfd.timeout = setup_time + next_value;
+    }
+    hdl->info.timerfd.reset = next_reset;
+
+    spinlock_unlock(&hdl->info.timerfd.timer_lock);
+
+    /* Always go through install_async_event(), even for a zero `next_value`: installing with
+     * `time_us == 0` is what cancels an event that is still pending for this timerfd. Skipping the
+     * call would leave a previously armed timer firing after the application disarmed it. */
+    int64_t install_ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, hdl->pal_handle,
+                                              next_value, absolute_time,
+                                              &callback_itimer, (void*)hdl);
+    if (install_ret < 0) {
+        ret = install_ret;
+        goto out;
+    }
+
+    if (ovalue) {
+        ovalue->it_interval.tv_sec  = current_reset / TIME_US_IN_S;
+        ovalue->it_interval.tv_nsec = (current_reset % TIME_US_IN_S) * TIME_NS_IN_US;
+        ovalue->it_value.tv_sec     = current_timeout / TIME_US_IN_S;
+        ovalue->it_value.tv_nsec    = (current_timeout % TIME_US_IN_S) * TIME_NS_IN_US;
+    }
+
+    ret = 0;
+out:
+    put_handle(hdl);
+    return ret;
+}
+
+/* Emulates timerfd_gettime(2): stores the current setting of the timerfd `fd` in `*value`.
+ * `it_value` receives the time left until the stored timeout (zero if that time has passed) and
+ * `it_interval` receives the stored period.
+ *
+ * Returns 0 on success; -EBADF if `fd` is not open; -EINVAL if `fd` is not a timerfd; -EFAULT if
+ * `value` is not writable; or the converted error of the time query. */
+long libos_syscall_timerfd_gettime(int fd, struct __kernel_itimerspec* value) {
+ int ret;
+
+ struct libos_handle* hdl = get_fd_handle(fd, /*fd_flags=*/NULL, /*map=*/NULL);
+ if (!hdl)
+ return -EBADF;
+
+ if (hdl->type != TYPE_TIMERFD) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!is_user_memory_writable(value, sizeof(*value))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /* despite its name, `setup_time` holds the current time here */
+ uint64_t setup_time = 0;
+ ret = PalSystemTimeQuery(&setup_time);
+ if (ret < 0) {
+ ret = pal_to_unix_errno(ret);
+ goto out;
+ }
+
+ spinlock_lock(&hdl->info.timerfd.timer_lock);
+ uint64_t current_timeout = hdl->info.timerfd.timeout > setup_time
+ ? hdl->info.timerfd.timeout - setup_time
+ : 0;
+ uint64_t current_reset = hdl->info.timerfd.reset;
+ spinlock_unlock(&hdl->info.timerfd.timer_lock);
+
+ /* convert microseconds back into the seconds + nanoseconds representation */
+ value->it_interval.tv_sec = current_reset / TIME_US_IN_S;
+ value->it_interval.tv_nsec = (current_reset % TIME_US_IN_S) * TIME_NS_IN_US;
+ value->it_value.tv_sec = current_timeout / TIME_US_IN_S;
+ value->it_value.tv_nsec = (current_timeout % TIME_US_IN_S) * TIME_NS_IN_US;
+
+ ret = 0;
+out:
+ put_handle(hdl);
+ return ret;
+}
diff --git a/libos/test/ltp/ltp.cfg b/libos/test/ltp/ltp.cfg
index 07ba977792..d1d845368b 100644
--- a/libos/test/ltp/ltp.cfg
+++ b/libos/test/ltp/ltp.cfg
@@ -2435,8 +2435,12 @@ skip = yes
[timer_settime*]
skip = yes
-# no timerfd
-[timerfd*]
+# clocks other than `CLOCK_REALTIME` are not supported
+[timerfd04]
+skip = yes
+
+# relies on "/proc/sys/kernel/tainted" (see tst_taint.c:tst_taint_check)
+[timerfd_settime02]
skip = yes
[times03]
diff --git a/libos/test/regression/meson.build b/libos/test/regression/meson.build
index 11c1f0156a..7a4745077f 100644
--- a/libos/test/regression/meson.build
+++ b/libos/test/regression/meson.build
@@ -151,6 +151,8 @@ tests = {
'tcp_einprogress': {},
'tcp_ipv6_v6only': {},
'tcp_msg_peek': {},
+ 'timerfd': {},
+ 'timerfd_fork': {},
'udp': {},
'uid_gid': {},
'unix': {},
diff --git a/libos/test/regression/test_libos.py b/libos/test/regression/test_libos.py
index 3f16eae33e..7daf005ee8 100644
--- a/libos/test/regression/test_libos.py
+++ b/libos/test/regression/test_libos.py
@@ -1017,6 +1017,18 @@ def test_150_itimer(self):
stdout, _ = self.run_binary(['itimer'])
self.assertIn("TEST OK", stdout)
+ def test_160_timerfd(self):
+ stdout, _ = self.run_binary(['timerfd'], timeout=120)
+ self.assertIn("TEST OK", stdout)
+
+ def test_161_timerfd_fork(self):
+ try:
+ self.run_binary(['timerfd_fork'])
+ self.fail('timerfd_fork unexpectedly succeeded')
+ except subprocess.CalledProcessError as e:
+ stdout = e.stdout.decode()
+ self.assertIn('child died', stdout)
+
class TC_31_Syscall(RegressionTestCase):
def test_000_syscall_redirect(self):
stdout, _ = self.run_binary(['syscall'])
diff --git a/libos/test/regression/tests.toml b/libos/test/regression/tests.toml
index 1ed4da79f4..a1b1170f5a 100644
--- a/libos/test/regression/tests.toml
+++ b/libos/test/regression/tests.toml
@@ -129,6 +129,8 @@ manifests = [
"tcp_einprogress",
"tcp_ipv6_v6only",
"tcp_msg_peek",
+ "timerfd",
+ "timerfd_fork",
"toml_parsing",
"udp",
"uid_gid",
diff --git a/libos/test/regression/tests_musl.toml b/libos/test/regression/tests_musl.toml
index d8cbfac653..d430bdbc92 100644
--- a/libos/test/regression/tests_musl.toml
+++ b/libos/test/regression/tests_musl.toml
@@ -130,6 +130,8 @@ manifests = [
"tcp_einprogress",
"tcp_ipv6_v6only",
"tcp_msg_peek",
+ "timerfd",
+ "timerfd_fork",
"toml_parsing",
"udp",
"uid_gid",
diff --git a/libos/test/regression/timerfd.c b/libos/test/regression/timerfd.c
new file mode 100644
index 0000000000..842930ded0
--- /dev/null
+++ b/libos/test/regression/timerfd.c
@@ -0,0 +1,300 @@
+/* SPDX-License-Identifier: LGPL-3.0-or-later */
+/* Copyright (C) 2024 Intel Corporation
+ * Kailun Qin
+ */
+
+/*
+ * Single-process test for `timerfd` syscalls (`timerfd_create()`, `timerfd_settime()` and
+ * `timerfd_gettime()`).
+ *
+ * The tests cover reading a blocking/non-blocking timerfd, waiting on timerfds with
+ * select/poll/epoll, setting up a relative/absolute/periodic timerfd and reading a timerfd from
+ * multiple threads.
+ */
+
+#define _GNU_SOURCE
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "common.h"
+
+/* NOTE: evaluates each argument twice; only pass side-effect-free expressions */
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+#define EXPECTED_EXPIRATIONS 1 /* expirations expected from one read of a one-shot timer */
+#define EXPECTED_PERIODIC_TIMER_EXPIRATION_COUNT 5 /* total awaited by test_periodic_timer() */
+#define NUM_FDS 2 /* number of timerfds created by create_timerfds() */
+#define NUM_THREADS 5 /* reader threads spawned by test_threaded_read() */
+#define PERIODIC_INTERVAL 1 /* re-arm interval of periodic timers, in seconds */
+#define TIMEOUT_VALUE 2 /* initial timer expiration, in seconds */
+
+/*
+ * Arms `fd` with a relative timeout of TIMEOUT_VALUE seconds. If `periodic` is set, the timer
+ * additionally gets an interval of PERIODIC_INTERVAL seconds; otherwise the interval stays zero
+ * (one-shot timer).
+ */
+static void set_timerfd_relative(int fd, bool periodic) {
+    struct itimerspec spec = { .it_value = { .tv_sec = TIMEOUT_VALUE } };
+
+    if (periodic)
+        spec.it_interval.tv_sec = PERIODIC_INTERVAL;
+
+    CHECK(timerfd_settime(fd, /*flags=*/0, &spec, /*old_value=*/NULL));
+}
+
+/* Arms each of the NUM_FDS timerfds in `fds` via set_timerfd_relative(), all with the same
+ * `periodic` setting. */
+static void set_timerfds_relative(int fds[NUM_FDS], bool periodic) {
+    int idx = 0;
+    while (idx < NUM_FDS) {
+        set_timerfd_relative(fds[idx], periodic);
+        idx++;
+    }
+}
+
+/* Arms `fd` as a one-shot timer (zero interval) that expires at the absolute time `abs_time`. */
+static void set_timerfd_absolute(int fd, struct timespec* abs_time) {
+    struct itimerspec spec = {
+        .it_value    = *abs_time, /* expiration point, taken over as-is */
+        .it_interval = { .tv_sec = 0, .tv_nsec = 0 }, /* no periodic re-arm */
+    };
+
+    /* TFD_TIMER_ABSTIME: `it_value` is an absolute time, not an offset from now */
+    CHECK(timerfd_settime(fd, TFD_TIMER_ABSTIME, &spec, /*old_value=*/NULL));
+}
+
+/* Fills `fds` with NUM_FDS newly created CLOCK_REALTIME timerfds (no creation flags). */
+static void create_timerfds(int fds[NUM_FDS]) {
+    for (int* slot = fds; slot != fds + NUM_FDS; slot++) {
+        *slot = CHECK(timerfd_create(CLOCK_REALTIME, /*flags=*/0));
+    }
+}
+
+/* Closes each of the NUM_FDS timerfds stored in `fds`. */
+static void close_timerfds(int fds[NUM_FDS]) {
+    for (const int* slot = fds; slot != fds + NUM_FDS; slot++) {
+        CHECK(close(*slot));
+    }
+}
+
+/*
+ * Blocks in select() (no timeout) until at least one of the timerfds in `fds` becomes readable,
+ * then reads every fd that select() reported as ready and checks that it expired exactly once.
+ */
+static void test_select(int fds[NUM_FDS]) {
+    fd_set rfds;
+    FD_ZERO(&rfds);
+
+    /* select() takes the highest watched fd plus one; derive it from all NUM_FDS entries rather
+     * than assuming that there are exactly two of them */
+    int nfds = 0;
+    for (int i = 0; i < NUM_FDS; i++) {
+        FD_SET(fds[i], &rfds);
+        nfds = MAX(nfds, fds[i] + 1);
+    }
+
+    CHECK(select(nfds, &rfds, NULL, NULL, NULL));
+
+    for (int i = 0; i < NUM_FDS; i++) {
+        if (!FD_ISSET(fds[i], &rfds))
+            continue;
+
+        uint64_t expirations;
+        CHECK(read(fds[i], &expirations, sizeof(expirations)));
+        if (expirations != 1)
+            errx(1, "select: unexpected number of expirations (expected 1, got %lu)",
+                 expirations);
+    }
+}
+
+/*
+ * Blocks in poll() (infinite timeout) on all timerfds in `fds`, then reads every fd that has
+ * POLLIN set and checks that it expired exactly once.
+ */
+static void test_poll(int fds[NUM_FDS]) {
+    struct pollfd pfds[NUM_FDS];
+    for (int i = 0; i < NUM_FDS; i++)
+        pfds[i] = (struct pollfd){ .fd = fds[i], .events = POLLIN };
+
+    CHECK(poll(pfds, NUM_FDS, /*timeout=*/-1));
+
+    for (int i = 0; i < NUM_FDS; i++) {
+        if (!(pfds[i].revents & POLLIN))
+            continue;
+
+        uint64_t fired;
+        CHECK(read(pfds[i].fd, &fired, sizeof(fired)));
+        if (fired != 1)
+            errx(1, "poll: unexpected number of expirations (expected 1, got %lu)",
+                 fired);
+    }
+}
+
+/*
+ * Registers all timerfds in `fds` for EPOLLIN on a fresh epoll instance, blocks in epoll_wait()
+ * (infinite timeout), then reads every fd reported as ready and checks that it expired exactly
+ * once. The epoll instance is closed before returning.
+ */
+static void test_epoll(int fds[NUM_FDS]) {
+    int epfd = CHECK(epoll_create1(0));
+
+    for (int i = 0; i < NUM_FDS; i++) {
+        /* the fd is stored in the event data so that the loop below knows which timerfd to read */
+        struct epoll_event ev = { .events = EPOLLIN, .data.fd = fds[i] };
+        CHECK(epoll_ctl(epfd, EPOLL_CTL_ADD, fds[i], &ev));
+    }
+
+    struct epoll_event events[NUM_FDS];
+    int nfds = CHECK(epoll_wait(epfd, events, NUM_FDS, -1));
+
+    for (int n = 0; n < nfds; ++n) {
+        uint64_t expirations;
+        CHECK(read(events[n].data.fd, &expirations, sizeof(expirations)));
+        if (expirations != 1)
+            errx(1, "epoll: unexpected number of expirations (expected 1, got %lu)", expirations);
+    }
+
+    /* a failing close() must fail the test, same as for the timerfds in close_timerfds() */
+    CHECK(close(epfd));
+}
+
+/* State shared between test_periodic_timer() and its reader thread: the reader adds to
+ * `expiration_count` while holding `mutex` and signals `cond` after each update. */
+static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
+static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+static size_t expiration_count = 0;
+
+/*
+ * Thread body used by test_periodic_timer(): reads the timerfd whose descriptor `arg` points to
+ * in an endless loop and adds every reported expiration count to `expiration_count` while holding
+ * `mutex`, signaling `cond` after each update. The loop has no exit; the spawning test cancels
+ * the thread.
+ */
+static void* timerfd_read_thread_periodic_timer(void* arg) {
+    const int timer_fd = *(int*)arg;
+
+    while (true) {
+        uint64_t fired;
+        CHECK(read(timer_fd, &fired, sizeof(fired)));
+
+        pthread_mutex_lock(&mutex);
+        expiration_count += fired;
+        pthread_cond_signal(&cond);
+        pthread_mutex_unlock(&mutex);
+    }
+
+    return NULL;
+}
+
+/*
+ * Spawns a reader thread on the timerfd `fd`, waits until the thread has accumulated at least
+ * EXPECTED_PERIODIC_TIMER_EXPIRATION_COUNT expirations, verifies that the total is exactly that
+ * number, and finally cancels and joins the thread.
+ */
+static void test_periodic_timer(int fd) {
+    /* pthread functions report failure by returning a positive error number (they do not return
+     * -1 with errno set), so their results are checked explicitly */
+    pthread_t thread;
+    int ret = pthread_create(&thread, NULL, timerfd_read_thread_periodic_timer, &fd);
+    if (ret != 0)
+        errx(1, "pthread_create failed: %d", ret);
+
+    /* wait for at least 5 expirations; the counter is copied while the mutex is still held
+     * because the reader thread keeps updating it concurrently */
+    pthread_mutex_lock(&mutex);
+    while (expiration_count < EXPECTED_PERIODIC_TIMER_EXPIRATION_COUNT) {
+        pthread_cond_wait(&cond, &mutex);
+    }
+    size_t count = expiration_count;
+    pthread_mutex_unlock(&mutex);
+
+    if (count != EXPECTED_PERIODIC_TIMER_EXPIRATION_COUNT)
+        errx(1, "periodic_timer: unexpected number of expirations (expected %d, got %zu)",
+             EXPECTED_PERIODIC_TIMER_EXPIRATION_COUNT, count);
+
+    /* cleanup: cancel the read thread and wait for it to exit */
+    ret = pthread_cancel(thread);
+    if (ret != 0)
+        errx(1, "pthread_cancel failed: %d", ret);
+
+    ret = pthread_join(thread, NULL);
+    if (ret != 0)
+        errx(1, "pthread_join failed: %d", ret);
+}
+
+/*
+ * Thread body used by test_threaded_read(): performs a single read on the timerfd whose
+ * descriptor `arg` points to and fails the test if the read reports zero expirations.
+ */
+static void* timerfd_read_thread(void* arg) {
+    int fd = *(int*)arg;
+    uint64_t expirations;
+    CHECK(read(fd, &expirations, sizeof(expirations)));
+
+    /* errx() rather than err(): this is a logic failure, so errno holds nothing relevant and
+     * must not be appended to the message */
+    if (expirations == 0)
+        errx(1, "threaded read: unexpected number of expirations");
+
+    return NULL;
+}
+
+/*
+ * Reads the timerfd `fd` from NUM_THREADS threads, strictly one after another: each thread is
+ * joined before the next one is created.
+ */
+static void test_threaded_read(int fd) {
+    pthread_t threads[NUM_THREADS];
+    for (int i = 0; i < NUM_THREADS; i++) {
+        /* pthread functions report failure by returning a positive error number (they do not
+         * return -1 with errno set), so their results are checked explicitly */
+        int ret = pthread_create(&threads[i], NULL, timerfd_read_thread, &fd);
+        if (ret != 0)
+            errx(1, "pthread_create failed: %d", ret);
+
+        /* wait for the thread to finish */
+        ret = pthread_join(threads[i], NULL);
+        if (ret != 0)
+            errx(1, "pthread_join failed: %d", ret);
+    }
+}
+
+/*
+ * Queries the remaining time of the timerfd `fd` and fails the test unless the seconds part lies
+ * in [1, 2] and the nanoseconds part is a valid value in [0, 1000000000).
+ */
+static void test_timerfd_gettime(int fd) {
+    struct itimerspec remaining;
+    CHECK(timerfd_gettime(fd, &remaining));
+
+    /* the timer should be set to expire close to 2 seconds */
+    const struct timespec* left = &remaining.it_value;
+    bool sec_in_range  = left->tv_sec >= 1 && left->tv_sec <= 2;
+    bool nsec_in_range = left->tv_nsec >= 0 && left->tv_nsec < 1000000000;
+    if (sec_in_range && nsec_in_range)
+        return;
+
+    errx(1, "timerfd_gettime: unexpected timer value (expected close to 2.0, got %ld.%09ld)",
+         left->tv_sec, left->tv_nsec);
+}
+
+/*
+ * Exercises absolute-time timers on `fd` with two one-shot deadlines: one TIMEOUT_VALUE seconds
+ * after the current CLOCK_REALTIME time and one TIMEOUT_VALUE seconds before it. In both cases
+ * the subsequent read must report exactly EXPECTED_EXPIRATIONS expirations.
+ */
+static void test_absolute_time(int fd) {
+    struct timespec now;
+    struct timespec deadline;
+    uint64_t fired;
+
+    /* case 1: deadline lies in the future */
+    CHECK(clock_gettime(CLOCK_REALTIME, &now));
+    deadline.tv_sec  = now.tv_sec + TIMEOUT_VALUE;
+    deadline.tv_nsec = now.tv_nsec;
+    set_timerfd_absolute(fd, &deadline);
+
+    CHECK(read(fd, &fired, sizeof(fired)));
+    if (fired != EXPECTED_EXPIRATIONS) {
+        errx(1, "absolute_time future: unexpected number of expirations (expected %d, got %lu)",
+             EXPECTED_EXPIRATIONS, fired);
+    }
+
+    /* wipe all state of case 1 so that nothing stale can leak into case 2 */
+    fired = 0;
+    now = (struct timespec){0};
+    deadline = (struct timespec){0};
+
+    /* case 2: deadline already lies in the past */
+    CHECK(clock_gettime(CLOCK_REALTIME, &now));
+    deadline.tv_sec  = now.tv_sec - TIMEOUT_VALUE;
+    deadline.tv_nsec = now.tv_nsec;
+    set_timerfd_absolute(fd, &deadline);
+
+    CHECK(read(fd, &fired, sizeof(fired)));
+    if (fired != EXPECTED_EXPIRATIONS) {
+        errx(1, "absolute_time past: unexpected number of expirations (expected %d, got %lu)",
+             EXPECTED_EXPIRATIONS, fired);
+    }
+}
+
+/*
+ * Performs one read on the timerfd `fd`.
+ *
+ * With `non_blocking` unset, the read must succeed and report EXPECTED_EXPIRATIONS expirations.
+ * With `non_blocking` set, the fd is first switched to O_NONBLOCK and the read must fail with
+ * EAGAIN.
+ */
+static void test_read(int fd, bool non_blocking) {
+    if (non_blocking) {
+        CHECK(fcntl(fd, F_SETFL, O_NONBLOCK));
+    }
+
+    uint64_t fired;
+    int retval = read(fd, &fired, sizeof(fired));
+
+    if (!non_blocking) {
+        CHECK(retval);
+        if (fired != EXPECTED_EXPIRATIONS) {
+            errx(1, "read: unexpected number of expirations (expected %d, got %lu)",
+                 EXPECTED_EXPIRATIONS, fired);
+        }
+        return;
+    }
+
+    /* non-blocking mode: the only acceptable outcome is a failed read with EAGAIN */
+    if (!(retval == -1 && errno == EAGAIN)) {
+        errx(1, "non-blocking read: read returned %d, errno %d, expected -1 and EAGAIN",
+             retval, errno);
+    }
+}
+
+/*
+ * Runs all timerfd sub-tests in sequence on NUM_FDS timerfds, re-arming the timer(s) before each
+ * sub-test. Prints "TEST OK" and returns 0 after all sub-tests have returned; the sub-tests call
+ * errx() on unexpected results.
+ */
+int main(void) {
+ int fds[NUM_FDS];
+ create_timerfds(fds);
+
+ /* readiness notification via select/poll/epoll, each on freshly armed one-shot timers */
+ set_timerfds_relative(fds, /*periodic*/false);
+ test_select(fds);
+
+ set_timerfds_relative(fds, /*periodic*/false);
+ test_poll(fds);
+
+ set_timerfds_relative(fds, /*periodic*/false);
+ test_epoll(fds);
+
+ /* periodic timer drained by a dedicated reader thread */
+ set_timerfd_relative(fds[0], /*periodic*/true);
+ test_periodic_timer(fds[0]);
+
+ /* periodic timer read by several threads, one after another */
+ set_timerfd_relative(fds[0], /*periodic*/true);
+ test_threaded_read(fds[0]);
+
+ /* remaining time queried right after arming a one-shot timer */
+ set_timerfd_relative(fds[0], /*periodic*/false);
+ test_timerfd_gettime(fds[0]);
+
+ /* first read (blocking mode) must report the expiration; the second call switches fds[0] to
+ * O_NONBLOCK and expects its read to fail with EAGAIN */
+ set_timerfd_relative(fds[0], /*periodic*/false);
+ test_read(fds[0], /*non_blocking=*/false);
+ test_read(fds[0], /*non_blocking=*/true);
+
+ /* absolute-time timers use fds[1], which was not switched to O_NONBLOCK above */
+ test_absolute_time(fds[1]);
+
+ close_timerfds(fds);
+
+ puts("TEST OK");
+ return 0;
+}
diff --git a/libos/test/regression/timerfd_fork.c b/libos/test/regression/timerfd_fork.c
new file mode 100644
index 0000000000..daa3874646
--- /dev/null
+++ b/libos/test/regression/timerfd_fork.c
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: LGPL-3.0-or-later */
+/* Copyright (C) 2024 Intel Corporation
+ * Kailun Qin
+ */
+
+/* Multi-process test for `timerfd` syscalls (`timerfd_create()`, `timerfd_settime()` and
+ * `timerfd_gettime()`).
+ *
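+ * Note that timerfd is currently only emulated in a secure single-process mode, so this test is
+ * expected to fail: the child cannot use the timerfd inherited from the parent, and the parent
+ * then reports that the child died.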
+ */
+
+#define _GNU_SOURCE
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "common.h"
+
+#define EXPECTED_EXPIRATIONS 1 /* expirations the child expects from its single read */
+#define TIMEOUT_VALUE 2 /* initial timer expiration, in seconds */
+
+/* Arms `fd` as a one-shot timer (zero interval) with a relative timeout of TIMEOUT_VALUE
+ * seconds. */
+static void set_timerfd(int fd) {
+    struct itimerspec spec = {
+        .it_value    = { .tv_sec = TIMEOUT_VALUE },
+        .it_interval = { .tv_sec = 0 },
+    };
+
+    CHECK(timerfd_settime(fd, /*flags=*/0, &spec, /*old_value=*/NULL));
+}
+
+/*
+ * Forks with the armed timerfd `fd` open. The child reads the inherited timerfd, checks that it
+ * reports EXPECTED_EXPIRATIONS expirations and exits with status 0. The parent only waits for the
+ * child and fails with "child died ..." unless the child exited normally with status 0.
+ */
+static void test_multi_process(int fd) {
+    pid_t child = CHECK(fork());
+
+    if (child != 0) {
+        /* parent: do nothing with the timerfd, just wait for the child process to exit */
+        int wstatus = 0;
+        CHECK(waitpid(child, &wstatus, 0));
+        if (!(WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0)) {
+            errx(1, "child died with status: %#x", wstatus);
+        }
+        return;
+    }
+
+    /* child: wait on a blocking read for the timer to expire */
+    uint64_t fired;
+    CHECK(read(fd, &fired, sizeof(fired)));
+    if (fired != EXPECTED_EXPIRATIONS) {
+        errx(1, "child process: unexpected number of expirations (expected %d, got %lu)",
+             EXPECTED_EXPIRATIONS, fired);
+    }
+    exit(0);
+}
+
+/*
+ * Creates a CLOCK_REALTIME timerfd, arms it and lets a forked child read it. "TEST OK" is printed
+ * only if test_multi_process() returns, i.e. only if the child exited with status 0.
+ */
+int main(void) {
+ int fd = CHECK(timerfd_create(CLOCK_REALTIME, 0));
+
+ /* the timer is armed before the fork, so the child inherits an already armed timerfd */
+ set_timerfd(fd);
+ test_multi_process(fd);
+
+ CHECK(close(fd));
+
+ puts("TEST OK");
+ return 0;
+}