From 9fdae8c618807156499d331680c5cc84b907d2a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B6ren=20Tempel?= Date: Mon, 28 Oct 2024 16:25:10 +0100 Subject: [PATCH 1/7] sched: Support POSIX's SCHED_RR scheduling policy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, pull request #1223 added support for the SCHED_FIFO but didn't implement SCHED_RR. This PR attempts to follow-up on that by proposing an implementation of SCHED_RR. Time slicing support, as mandated by SCHED_RR, is implemented though the `set_realtime_time_slice(duration)` API added in the aforementioned pull request. Within the scheduler, the amount of time the thread ran so far is tracked if it exceeds the set duration, the thread is preempted (if there is a runnable thread of same or higher priority). The thread's run time is reset on preemption. Signed-off-by: Sören Tempel --- core/sched.cc | 46 ++++++++++--- include/osv/sched.hh | 19 ++++++ modules/tests/Makefile | 2 +- tests/tst-thread-realtime.cc | 128 +++++++++++++++++++++++++++++++++++ 4 files changed, 183 insertions(+), 12 deletions(-) create mode 100644 tests/tst-thread-realtime.cc diff --git a/core/sched.cc b/core/sched.cc index e1736f0406..6e0fae409b 100644 --- a/core/sched.cc +++ b/core/sched.cc @@ -276,6 +276,10 @@ void cpu::reschedule_from_interrupt(bool called_from_yield, } thread* p = thread::current(); + if (p->_realtime.has_slice()) { + p->_realtime._run_time += interval; + } + const auto p_status = p->_detached_state->st.load(); assert(p_status != thread::status::queued); @@ -295,8 +299,17 @@ void cpu::reschedule_from_interrupt(bool called_from_yield, } else if (!called_from_yield) { auto &t = *runqueue.begin(); if (p->_realtime._priority > 0) { - // Only switch to a higher-priority realtime thread - if (t._realtime._priority <= p->_realtime._priority) { + // If the threads have the same realtime priority, then only reschedule + // if the currently executed thread has exceeded its time slice (if any). + if (t._realtime._priority == p->_realtime._priority && + ((!p->_realtime.has_slice() || p->_realtime.has_remaining()))) { +#ifdef __aarch64__ + return switch_data; +#else + return; +#endif + // Otherwise, only switch to a higher-priority realtime thread. + } else if (t._realtime._priority < p->_realtime._priority) { #ifdef __aarch64__ return switch_data; #else @@ -336,15 +349,27 @@ void cpu::reschedule_from_interrupt(bool called_from_yield, p->_detached_state->st.store(thread::status::queued); if (!called_from_yield) { - // POSIX requires that if a real-time thread doesn't yield but - // rather is preempted by a higher-priority thread, it be - // reinserted into the runqueue first, not last, among its equals. - enqueue_first_equal(*p); + if (p->_realtime.has_slice() && !p->_realtime.has_remaining()) { + // The real-time thread has exceeded it's time slice, enqueue + // it at the end among its equals and reset the time slice. + enqueue(*p); + p->_realtime.reset_slice(); + } else { + // POSIX requires that if a real-time thread doesn't yield but + // rather is preempted by a higher-priority thread, it be + // reinserted into the runqueue first, not last, among its equals. + enqueue_first_equal(*p); + } } trace_sched_preempt(); p->stat_preemptions.incr(); } else { + // p is no longer running, if it has a realtime slice reset it. + if (p->_realtime.has_slice()) { + p->_realtime.reset_slice(); + } + // p is no longer running, so we'll switch to a different thread. // Return the runtime p borrowed for hysteresis. p->_runtime.hysteresis_run_stop(); @@ -380,7 +405,10 @@ void cpu::reschedule_from_interrupt(bool called_from_yield, n->_runtime.add_context_switch_penalty(); } preemption_timer.cancel(); - if (n->_realtime._priority == 0) { + if (n->_realtime.has_slice()) { + assert(n->_realtime.has_remaining()); + preemption_timer.set_with_irq_disabled(now + n->_realtime.remaining()); + } else if (n->_realtime._priority == 0) { if (!called_from_yield) { if (!runqueue.empty()) { auto& t = *runqueue.begin(); @@ -920,10 +948,6 @@ unsigned thread::realtime_priority() const void thread::set_realtime_time_slice(thread_realtime::duration time_slice) { - if (time_slice > 0) { - WARN_ONCE("set_realtime_time_slice() used but real-time time slices" - " not yet supported"); - } _realtime._time_slice = time_slice; } diff --git a/include/osv/sched.hh b/include/osv/sched.hh index 0ed8fa11e7..6488faaf99 100644 --- a/include/osv/sched.hh +++ b/include/osv/sched.hh @@ -348,6 +348,25 @@ public: using duration = thread_runtime::duration; unsigned _priority = 0; duration _time_slice = duration::zero(); + + // Total time this thread ran since starting this slice. + duration _run_time = duration::zero(); + + void reset_slice() { + _run_time = duration::zero(); + } + + bool has_slice() const { + return _time_slice != duration::zero(); + } + + bool has_remaining() const { + return _time_slice > _run_time; + } + + duration remaining() const { + return _time_slice - _run_time; + } }; // "tau" controls the length of the history we consider for scheduling, diff --git a/modules/tests/Makefile b/modules/tests/Makefile index 52596e419d..3e7d3dba9f 100644 --- a/modules/tests/Makefile +++ b/modules/tests/Makefile @@ -146,7 +146,7 @@ tests := tst-pthread.so misc-ramdisk.so tst-vblk.so tst-bsd-evh.so \ tst-sigaction.so tst-syscall.so tst-ifaddrs.so tst-getdents.so \ tst-netlink.so misc-zfs-io.so misc-zfs-arc.so tst-pthread-create.so \ misc-futex-perf.so misc-syscall-perf.so tst-brk.so tst-reloc.so \ - misc-vdso-perf.so tst-string-utils.so + misc-vdso-perf.so tst-string-utils.so tst-thread-realtime.so # libstatic-thread-variable.so tst-static-thread-variable.so \ # tst-f128.so \ diff --git a/tests/tst-thread-realtime.cc b/tests/tst-thread-realtime.cc new file mode 100644 index 0000000000..bbd3a3bbf7 --- /dev/null +++ b/tests/tst-thread-realtime.cc @@ -0,0 +1,128 @@ +#include +#include +#include +#include + +#define TIME_SLICE 100000 + +static std::string name = "tst-thr-wrk"; +static int tests = 0, fails = 0; + +static void report(bool ok, std::string msg) +{ + ++tests; + fails += !ok; + std::cout << (ok ? "PASS" : "FAIL") << ": " << msg << "\n"; +} + +static int fac(int n) +{ + if (n == 0) { + return 0; + } else if (n == 1) { + return 1; + } else { + return fac(n - 1) + fac(n - 2); + } +} + +bool runtime_equalized() +{ + constexpr int num_threads = 5; + constexpr int switches = 3; + static std::atomic stop_threads; + sched::thread *threads[num_threads]; + + sched::cpu *c = sched::cpu::current(); + for (int i = 0; i < num_threads; i++) { + threads[i] = sched::thread::make([&]{ + while (!stop_threads.load()) { + fac(10); + } + }, sched::thread::attr().name(name)); + + threads[i]->pin(c); + threads[i]->set_realtime_priority(1); + threads[i]->set_realtime_time_slice(sched::thread_realtime::duration(TIME_SLICE)); + threads[i]->start(); + } + + auto runtime = sched::thread_realtime::duration(TIME_SLICE * num_threads * switches); + sched::thread::sleep(runtime); + stop_threads = true; + + bool ok = true; + long prev_switches = -1; + for (int i = 0; i < num_threads; i++) { + long switches = threads[i]->stat_switches.get(); + if (prev_switches != -1 && prev_switches != switches) { + ok = false; + break; + } + prev_switches = switches; + } + + for (int i = 0; i < num_threads; i++) { + delete threads[i]; + } + + return ok; +} + +bool priority_precedence() +{ + static std::atomic high_prio_stop; + sched::thread *high_prio = sched::thread::make([&]{ + while (!high_prio_stop.load()) { + fac(10); + } + }); + + static std::atomic low_prio_stop; + sched::thread *low_prio = sched::thread::make([&]{ + while (!low_prio_stop.load()) { + fac(10); + } + }); + + sched::cpu *c = sched::cpu::current(); + high_prio->pin(c); + low_prio->pin(c); + + high_prio->set_realtime_priority(2); + low_prio->set_realtime_priority(1); + + // The higher priority thread has a time slice, but since there is no other thread + // with the same priority, it should monopolize the CPU and lower_prio shouldn't run. + high_prio->set_realtime_time_slice(sched::thread_realtime::duration(TIME_SLICE)); + + high_prio->start(); + low_prio->start(); + + auto runtime = sched::thread_realtime::duration(TIME_SLICE * 3); + sched::thread::sleep(runtime); + + // Since both threads are pinned to the CPU and the higher priority + // thread is always runnable, the lower priority thread should starve. + bool ok = high_prio->thread_clock().count() > 0 && + low_prio->thread_clock().count() == 0; + + low_prio_stop = true; + high_prio_stop = true; + + delete high_prio; + delete low_prio; + + return ok; +} + +int main(int ac, char** av) +{ + // Ensure that the main thread doesn't starve. + sched::thread *cur = sched::thread::current(); + cur->set_realtime_priority(10); + + report(runtime_equalized(), "runtime_equalized"); + report(priority_precedence(), "priority_precedence"); + std::cout << "SUMMARY: " << tests << " tests, " << fails << " failures\n"; +} From 25357218742054b97f62c37844c6381fb70790f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B6ren=20Tempel?= Date: Wed, 4 Dec 2024 02:52:05 +0100 Subject: [PATCH 2/7] fixup! sched: Support POSIX's SCHED_RR scheduling policy --- include/osv/sched.hh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/osv/sched.hh b/include/osv/sched.hh index 6488faaf99..af71a28123 100644 --- a/include/osv/sched.hh +++ b/include/osv/sched.hh @@ -710,6 +710,8 @@ public: * With time_slice == 0, the real-time scheduling policy matches POSIX's * "SCHED_FIFO" policy. With time_slice > 0, it matches POSIX's "SCHED_RR" * policy. + * + * Note: The time_slice should be set before the thread is started. */ void set_realtime_time_slice(thread_realtime::duration time_slice); /** From 8db344454f4e454d6660c83e4666c4769ad0e7c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B6ren=20Tempel?= Date: Wed, 4 Dec 2024 21:24:31 +0100 Subject: [PATCH 3/7] fixup! sched: Support POSIX's SCHED_RR scheduling policy --- core/sched.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/sched.cc b/core/sched.cc index 6e0fae409b..e82efad902 100644 --- a/core/sched.cc +++ b/core/sched.cc @@ -276,7 +276,7 @@ void cpu::reschedule_from_interrupt(bool called_from_yield, } thread* p = thread::current(); - if (p->_realtime.has_slice()) { + if (p->_realtime.has_slice() && p->_realtime._priority > 0) { p->_realtime._run_time += interval; } From b2f1957269d46d41b66c06b2849e402e443ad667 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B6ren=20Tempel?= Date: Wed, 4 Dec 2024 21:22:26 +0100 Subject: [PATCH 4/7] fixup! sched: Support POSIX's SCHED_RR scheduling policy --- core/sched.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/sched.cc b/core/sched.cc index e82efad902..fe39ea50a2 100644 --- a/core/sched.cc +++ b/core/sched.cc @@ -308,7 +308,8 @@ void cpu::reschedule_from_interrupt(bool called_from_yield, #else return; #endif - // Otherwise, only switch to a higher-priority realtime thread. + // Otherwise, don't switch to a lower-priority realtime thread, + // no matter how much time slice was used by the running thread. } else if (t._realtime._priority < p->_realtime._priority) { #ifdef __aarch64__ return switch_data; From 002e0a97ad2316ab4ded8266ae6c444fb3b2e495 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B6ren=20Tempel?= Date: Wed, 4 Dec 2024 11:22:09 +0100 Subject: [PATCH 5/7] fixup! sched: Support POSIX's SCHED_RR scheduling policy --- core/sched.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/core/sched.cc b/core/sched.cc index fe39ea50a2..44e24dc2af 100644 --- a/core/sched.cc +++ b/core/sched.cc @@ -406,10 +406,7 @@ void cpu::reschedule_from_interrupt(bool called_from_yield, n->_runtime.add_context_switch_penalty(); } preemption_timer.cancel(); - if (n->_realtime.has_slice()) { - assert(n->_realtime.has_remaining()); - preemption_timer.set_with_irq_disabled(now + n->_realtime.remaining()); - } else if (n->_realtime._priority == 0) { + if (n->_realtime._priority == 0) { if (!called_from_yield) { if (!runqueue.empty()) { auto& t = *runqueue.begin(); @@ -421,6 +418,9 @@ void cpu::reschedule_from_interrupt(bool called_from_yield, } else { preemption_timer.set_with_irq_disabled(now + preempt_after); } + } else if (n->_realtime.has_slice()) { + assert(n->_realtime.has_remaining()); + preemption_timer.set_with_irq_disabled(now + n->_realtime.remaining()); } if (app_thread.load(std::memory_order_relaxed) != n->_app) { // don't write into a cache line if it can be avoided From e9ea1a0a4cbdf27e2f2386598abd9ee484649527 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B6ren=20Tempel?= Date: Wed, 4 Dec 2024 20:47:27 +0100 Subject: [PATCH 6/7] fixup! sched: Support POSIX's SCHED_RR scheduling policy --- tests/tst-thread-realtime.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/tst-thread-realtime.cc b/tests/tst-thread-realtime.cc index bbd3a3bbf7..c6b752db00 100644 --- a/tests/tst-thread-realtime.cc +++ b/tests/tst-thread-realtime.cc @@ -43,11 +43,11 @@ bool runtime_equalized() threads[i]->pin(c); threads[i]->set_realtime_priority(1); - threads[i]->set_realtime_time_slice(sched::thread_realtime::duration(TIME_SLICE)); + threads[i]->set_realtime_time_slice(std::chrono::nanoseconds(TIME_SLICE)); threads[i]->start(); } - auto runtime = sched::thread_realtime::duration(TIME_SLICE * num_threads * switches); + auto runtime = std::chrono::nanoseconds(TIME_SLICE * num_threads * switches); sched::thread::sleep(runtime); stop_threads = true; @@ -99,7 +99,7 @@ bool priority_precedence() high_prio->start(); low_prio->start(); - auto runtime = sched::thread_realtime::duration(TIME_SLICE * 3); + auto runtime = std::chrono::nanoseconds(TIME_SLICE * 3); sched::thread::sleep(runtime); // Since both threads are pinned to the CPU and the higher priority From 3c27113e3e6e2677db99384c42dad8c2c864f4fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B6ren=20Tempel?= Date: Wed, 4 Dec 2024 12:52:09 +0100 Subject: [PATCH 7/7] fixup! sched: Support POSIX's SCHED_RR scheduling policy --- tests/tst-thread-realtime.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tst-thread-realtime.cc b/tests/tst-thread-realtime.cc index c6b752db00..bd963508b5 100644 --- a/tests/tst-thread-realtime.cc +++ b/tests/tst-thread-realtime.cc @@ -3,7 +3,7 @@ #include #include -#define TIME_SLICE 100000 +#define TIME_SLICE 100000000 static std::string name = "tst-thr-wrk"; static int tests = 0, fails = 0;