sched: Support POSIX's SCHED_RR scheduling policy #1338

Draft: wants to merge 7 commits into base: master
core/sched.cc (45 changes: 35 additions & 10 deletions)

@@ -276,6 +276,10 @@ void cpu::reschedule_from_interrupt(bool called_from_yield,
}
thread* p = thread::current();

if (p->_realtime.has_slice() && p->_realtime._priority > 0) {
p->_realtime._run_time += interval;
}

const auto p_status = p->_detached_state->st.load();
assert(p_status != thread::status::queued);

@@ -295,8 +299,18 @@ void cpu::reschedule_from_interrupt(bool called_from_yield,
} else if (!called_from_yield) {
auto &t = *runqueue.begin();
if (p->_realtime._priority > 0) {
// Only switch to a higher-priority realtime thread
if (t._realtime._priority <= p->_realtime._priority) {
// If the threads have the same realtime priority, then only reschedule
// if the currently executed thread has exceeded its time slice (if any).
if (t._realtime._priority == p->_realtime._priority &&
((!p->_realtime.has_slice() || p->_realtime.has_remaining()))) {
Collaborator:

So all this means the current thread p should stay running if _time_slice is 0 (run 'forever' until it yields or waits) OR there is still time left to run per its _time_slice, right?

Author:

[…] should stay running if _time_slice is 0 (run 'forever' until yields or waits) OR there is still time left to run per its _time_slice, right?

Yes: p->_realtime.has_slice() checks whether the thread has a time slice at all (i.e. _time_slice != 0), and p->_realtime.has_remaining() checks whether there is still time remaining on that slice (if it has one).

Note that, even if the thread has exceeded its time slice it may still be selected to run again if there is no thread with a higher priority. Hence, the priority comparison in the if condition.

#ifdef __aarch64__
return switch_data;
#else
return;
#endif
// Otherwise, don't switch to a lower-priority realtime thread,
// no matter how much time slice was used by the running thread.
} else if (t._realtime._priority < p->_realtime._priority) {
#ifdef __aarch64__
return switch_data;
#else
@@ -336,15 +350,27 @@ void cpu::reschedule_from_interrupt(bool called_from_yield,
p->_detached_state->st.store(thread::status::queued);

if (!called_from_yield) {
// POSIX requires that if a real-time thread doesn't yield but
// rather is preempted by a higher-priority thread, it be
// reinserted into the runqueue first, not last, among its equals.
enqueue_first_equal(*p);
if (p->_realtime.has_slice() && !p->_realtime.has_remaining()) {
Contributor:

Again, maybe we need to check if realtime.priority>0 because maybe has_slice() just records some old setting and it's not in use now?

Or, maybe, for simplicity, we should just ensure that if the real-time priority is ever set to 0, then slice is also set to 0?

// The real-time thread has exceeded its time slice, enqueue
// it at the end among its equals and reset the time slice.
enqueue(*p);
p->_realtime.reset_slice();
} else {
// POSIX requires that if a real-time thread doesn't yield but
Collaborator:

So this means that if the current thread p _time_slice is 0 OR p still has some remaining time to run, we will call enqueue_first_equal(). Is this correct?

Contributor:

Yes, I think it's correct. If we got here it means p was preempted. If it still has remaining time, it means it was preempted by a higher-priority realtime thread but when that higher-priority thread doesn't want to run, this thread p should continue running and continue its current time slice. The documentation says: "A SCHED_RR thread that has been preempted by a higher priority thread and subsequently resumes execution as a running thread will complete the unexpired portion of its round-robin time quantum.". It should be the first one in its priority group to run (and therefore enqueue_first_equal()) just like when no time slices existed.

// rather is preempted by a higher-priority thread, it be
// reinserted into the runqueue first, not last, among its equals.
enqueue_first_equal(*p);
}
}

trace_sched_preempt();
p->stat_preemptions.incr();
} else {
// p is no longer running, if it has a realtime slice reset it.
if (p->_realtime.has_slice()) {
p->_realtime.reset_slice();
}
Author (on lines +369 to +372):

It is not entirely clear to me if the slice should be reset if the thread is no longer runnable (e.g. because of blocking I/O). POSIX does not explicitly describe when the slice should be reset.

Contributor:

Yes, I'm also not sure, but I think this if is right. I think the idea of the time slice is to make sure that a single thread in its priority group never runs for more than 1ms (for example) without letting other threads in its group run. But if the thread blocks or yields voluntarily (I believe this if covers both cases, right?), then it gives some other thread a chance to run for a whole time slice, so it's only fair that this thread's time slice is reset to zero as well. I think.
I tried searching for any existing discussion of this question and couldn't find one.


// p is no longer running, so we'll switch to a different thread.
// Return the runtime p borrowed for hysteresis.
p->_runtime.hysteresis_run_stop();
@@ -392,6 +418,9 @@ void cpu::reschedule_from_interrupt(bool called_from_yield,
} else {
preemption_timer.set_with_irq_disabled(now + preempt_after);
}
} else if (n->_realtime.has_slice()) {
assert(n->_realtime.has_remaining());
preemption_timer.set_with_irq_disabled(now + n->_realtime.remaining());
}

if (app_thread.load(std::memory_order_relaxed) != n->_app) { // don't write into a cache line if it can be avoided
@@ -920,10 +949,6 @@ unsigned thread::realtime_priority() const

void thread::set_realtime_time_slice(thread_realtime::duration time_slice)
{
if (time_slice > 0) {
WARN_ONCE("set_realtime_time_slice() used but real-time time slices"
" not yet supported");
}
_realtime._time_slice = time_slice;
nmeum marked this conversation as resolved.
}

include/osv/sched.hh (21 changes: 21 additions & 0 deletions)

@@ -348,6 +348,25 @@ public:
using duration = thread_runtime::duration;
unsigned _priority = 0;
duration _time_slice = duration::zero();

// Total time this thread ran since starting this slice.
duration _run_time = duration::zero();

void reset_slice() {
_run_time = duration::zero();
}

bool has_slice() const {
return _time_slice != duration::zero();
}

bool has_remaining() const {
return _time_slice > _run_time;
}

duration remaining() const {
return _time_slice - _run_time;
}
};

// "tau" controls the length of the history we consider for scheduling,
@@ -691,6 +710,8 @@ public:
* With time_slice == 0, the real-time scheduling policy matches POSIX's
* "SCHED_FIFO" policy. With time_slice > 0, it matches POSIX's "SCHED_RR"
* policy.
*
* Note: The time_slice should be set before the thread is started.
*/
void set_realtime_time_slice(thread_realtime::duration time_slice);
/**
modules/tests/Makefile (2 changes: 1 addition & 1 deletion)

@@ -146,7 +146,7 @@ tests := tst-pthread.so misc-ramdisk.so tst-vblk.so tst-bsd-evh.so \
tst-sigaction.so tst-syscall.so tst-ifaddrs.so tst-getdents.so \
tst-netlink.so misc-zfs-io.so misc-zfs-arc.so tst-pthread-create.so \
misc-futex-perf.so misc-syscall-perf.so tst-brk.so tst-reloc.so \
misc-vdso-perf.so tst-string-utils.so
misc-vdso-perf.so tst-string-utils.so tst-thread-realtime.so
# libstatic-thread-variable.so tst-static-thread-variable.so \
# tst-f128.so \

tests/tst-thread-realtime.cc (128 additions, new file)

@@ -0,0 +1,128 @@
#include <atomic>
#include <chrono>
#include <string>
#include <iostream>
#include <osv/sched.hh>

#define TIME_SLICE 100000000

static std::string name = "tst-thr-wrk";
static int tests = 0, fails = 0;

static void report(bool ok, std::string msg)
{
++tests;
fails += !ok;
std::cout << (ok ? "PASS" : "FAIL") << ": " << msg << "\n";
}

// CPU-bound busy work. Note: despite the name, this recursion computes
// Fibonacci-style values; only the burned cycles matter to the test.
static int fac(int n)
{
if (n == 0) {
return 0;
} else if (n == 1) {
return 1;
} else {
return fac(n - 1) + fac(n - 2);
}
}

bool runtime_equalized()
{
constexpr int num_threads = 5;
constexpr int switches = 3;
static std::atomic<bool> stop_threads;
sched::thread *threads[num_threads];

sched::cpu *c = sched::cpu::current();
for (int i = 0; i < num_threads; i++) {
threads[i] = sched::thread::make([&]{
while (!stop_threads.load()) {
fac(10);
}
}, sched::thread::attr().name(name));

threads[i]->pin(c);
threads[i]->set_realtime_priority(1);
threads[i]->set_realtime_time_slice(std::chrono::nanoseconds(TIME_SLICE));
threads[i]->start();
}

auto runtime = std::chrono::nanoseconds(TIME_SLICE * num_threads * switches);
sched::thread::sleep(runtime);
stop_threads = true;

bool ok = true;
long prev_switches = -1;
for (int i = 0; i < num_threads; i++) {
long switches = threads[i]->stat_switches.get();
if (prev_switches != -1 && prev_switches != switches) {
Contributor:

Am I correct that you want all threads to have exactly the same number of context switches? How can we be confident of this - can't one happen to have one more than the others because of some inaccuracy or something?

Author:

Am I correct that you want all threads to have exactly the same number of context switches?

Yes. My thinking was: If we assign each thread a time slice of size N and then wait N * NUM_THREADS * EXPECTED_SWITCHES time units then (on a single core machine under a realtime scheduling policy), we would expect each thread to be preempted after N seconds. As such, each thread should have EXPECTED_SWITCHES amount of context switches.

Maybe I am missing something obvious, but I haven't seen this test fail yet. However, since this is not a hard-realtime operating systems I assume we could see delays here and there in the scheduler? We can also make the comparison fuzzy, allowing the amount of expected context switches to be off by 1 or 2?

ok = false;
break;
}
prev_switches = switches;
}

for (int i = 0; i < num_threads; i++) {
delete threads[i];
}

return ok;
}

bool priority_precedence()
{
static std::atomic<bool> high_prio_stop;
sched::thread *high_prio = sched::thread::make([&]{
while (!high_prio_stop.load()) {
fac(10);
}
});

static std::atomic<bool> low_prio_stop;
sched::thread *low_prio = sched::thread::make([&]{
while (!low_prio_stop.load()) {
fac(10);
}
});

sched::cpu *c = sched::cpu::current();
high_prio->pin(c);
low_prio->pin(c);

high_prio->set_realtime_priority(2);
low_prio->set_realtime_priority(1);

// The higher priority thread has a time slice, but since there is no other thread
// with the same priority, it should monopolize the CPU and lower_prio shouldn't run.
high_prio->set_realtime_time_slice(sched::thread_realtime::duration(TIME_SLICE));

high_prio->start();
low_prio->start();

auto runtime = std::chrono::nanoseconds(TIME_SLICE * 3);
sched::thread::sleep(runtime);

// Since both threads are pinned to the CPU and the higher priority
// thread is always runnable, the lower priority thread should starve.
bool ok = high_prio->thread_clock().count() > 0 &&
low_prio->thread_clock().count() == 0;
Contributor:

I'm a bit worried that it's theoretically possible that although you start()ed the high prio thread first and only then the low_prio one, maybe the low prio one got to run for a microsecond before the high prio one so its thread clock will not be exactly zero. But maybe in practice it doesn't happen...

Contributor:


This test can also have the opposite problem. If I understand correctly your TIME_SLICE is absolutely tiny, 0.1 milliseconds, and after starting high_prio and low_prio you only sleep 3 times that, i.e., 0.3 milliseconds, so it is theoretically possible that the test will pass even without any realtime priorities or anything, just because we let the first-ran highprio thread run for 0.3 milliseconds straight.

Author:

If I understand correctly your TIME_SLICE is absolutely tiny, 0.1 milliseconds

I increased it further in 3c27113

maybe the low prio one got to run for a microsecond before the high prio one

Note: they are both pinned to the same CPU, so the higher-prio one should also be first in the CPU runqueue and always runnable, and I am not sure under which scenario the lower-prio one would get the CPU. However, I believe you have more expertise with the scheduling code. We can also discard this test case.

What test cases would you like to see instead for SCHED_RR?


low_prio_stop = true;
high_prio_stop = true;

delete high_prio;
delete low_prio;

return ok;
}

int main(int ac, char** av)
{
// Ensure that the main thread doesn't starve.
sched::thread *cur = sched::thread::current();
cur->set_realtime_priority(10);

report(runtime_equalized(), "runtime_equalized");
report(priority_precedence(), "priority_precedence");
std::cout << "SUMMARY: " << tests << " tests, " << fails << " failures\n";
}