Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Properly manage JNI resources in JVMTI wallclock sampler #168

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
227 changes: 197 additions & 30 deletions ddprof-lib/src/main/cpp/wallClock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,141 @@
#include "vmStructs.h"
#include <math.h>
#include <random>
#include <cstdlib>

// RAII wrapper around jvmtiEnv::GetAllThreads().
// Owns both the JVMTI-allocated thread array and the JNI local references
// it contains; everything is released when the wrapper leaves scope, which
// matters here because the sampler loop never returns to Java and local
// references would otherwise accumulate.
class JVMTIThreads {
private:
    jvmtiEnv* _jvmti;       // JVMTI environment used for Deallocate()
    JNIEnv* _jni;           // JNI environment used to drop local references
    jthread* _threads_ptr;  // JVMTI-allocated array of thread local refs
    jint _threads_count;    // number of valid entries in _threads_ptr

    // Releases the owned local references and the JVMTI-allocated array.
    // Safe to call on an empty or moved-from instance. Shared by the
    // destructor and the move-assignment operator so neither needs the
    // undefined-behavior "explicit destructor call then reassign" pattern.
    void release() {
        if (_threads_ptr != nullptr) {
            // _jni and _jvmti are guaranteed non-null whenever _threads_ptr
            // is set (see constructor).
            for (jint i = 0; i < _threads_count; ++i) {
                if (_threads_ptr[i] != nullptr) {
                    _jni->DeleteLocalRef(_threads_ptr[i]);
                }
            }
            _jvmti->Deallocate(reinterpret_cast<unsigned char*>(_threads_ptr));
            _threads_ptr = nullptr;
            _threads_count = 0;
        }
    }

public:
    // Snapshots all live Java threads via GetAllThreads(). On failure, or
    // when either environment pointer is null, the wrapper is left empty
    // (count() == 0) rather than holding a partially initialized state.
    // The null-jni guard prevents the destructor from dereferencing a null
    // JNIEnv while deleting local references.
    JVMTIThreads(jvmtiEnv* jvmti, JNIEnv* jni)
        : _jvmti(jvmti), _jni(jni), _threads_ptr(nullptr), _threads_count(0) {
        if (_jvmti == nullptr || _jni == nullptr ||
            _jvmti->GetAllThreads(&_threads_count, &_threads_ptr) != JVMTI_ERROR_NONE) {
            _threads_count = 0;
            _threads_ptr = nullptr;
        }
    }

    ~JVMTIThreads() {
        release();
    }

    // Owning type: copying would double-release the references and the
    // array, so copies are disabled.
    JVMTIThreads(const JVMTIThreads&) = delete;
    JVMTIThreads& operator=(const JVMTIThreads&) = delete;

    // Moves transfer ownership and leave the source empty. Keeping the
    // source's environment pointers is harmless because release() only
    // acts when _threads_ptr is non-null.
    JVMTIThreads(JVMTIThreads&& other) noexcept
        : _jvmti(other._jvmti), _jni(other._jni),
          _threads_ptr(other._threads_ptr), _threads_count(other._threads_count) {
        other._threads_ptr = nullptr;
        other._threads_count = 0;
    }

    JVMTIThreads& operator=(JVMTIThreads&& other) noexcept {
        if (this != &other) {
            // Release current resources first, then steal the source's.
            // (No explicit destructor call: destroying *this and then
            // assigning to the destroyed object is undefined behavior.)
            release();
            _jvmti = other._jvmti;
            _jni = other._jni;
            _threads_ptr = other._threads_ptr;
            _threads_count = other._threads_count;
            other._threads_ptr = nullptr;
            other._threads_count = 0;
        }
        return *this;
    }

    // Number of threads captured (0 if GetAllThreads failed).
    jint count() const {
        return _threads_count;
    }

    // Bounds-checked access; returns nullptr for out-of-range indices.
    jthread operator[](jint index) const noexcept {
        if (index < 0 || index >= _threads_count) {
            return nullptr;
        }
        return _threads_ptr[index];
    }
};

// RAII helper that converts a (weak) global JNI reference into a local
// reference: the global reference is deleted immediately in the
// constructor, and the resulting local reference is deleted when this
// object leaves scope.
template<typename T>
class MoveToLocal {
private:
    JNIEnv* _jni; // JNI environment; may be nullptr (then no ref is held)
    T _ref;       // owned local reference, or nullptr

    // Deletes the owned local reference, if any. Shared by the destructor
    // and the move-assignment operator.
    void release() {
        if (_jni != nullptr && _ref != nullptr) {
            _jni->DeleteLocalRef(_ref);
            _ref = nullptr;
        }
    }

public:
    // Takes ownership of 'global_ref'. If it is a weak global reference
    // whose referent has already been collected, NewLocalRef() yields
    // nullptr and local() reports that to the caller.
    // BUGFIX: _ref is now always initialized — the original left it
    // indeterminate when jni or global_ref was null, so the destructor
    // could call DeleteLocalRef on a garbage pointer.
    MoveToLocal(JNIEnv* jni, T global_ref, bool is_weak) : _jni(jni), _ref(nullptr) {
        if (_jni != nullptr && global_ref != nullptr) {
            _ref = static_cast<T>(_jni->NewLocalRef(global_ref));
            if (is_weak) {
                _jni->DeleteWeakGlobalRef(global_ref);
            } else {
                _jni->DeleteGlobalRef(global_ref);
            }
        }
    }

    ~MoveToLocal() {
        release();
    }

    // Owning type: copying would double-delete the local reference.
    MoveToLocal(const MoveToLocal&) = delete;
    MoveToLocal& operator=(const MoveToLocal&) = delete;

    // Moves transfer ownership and leave the source empty.
    MoveToLocal(MoveToLocal&& other) noexcept
        : _jni(other._jni), _ref(other._ref) {
        other._jni = nullptr;
        other._ref = nullptr;
    }

    MoveToLocal& operator=(MoveToLocal&& other) noexcept {
        if (this != &other) {
            // Release current resources first, then steal the source's.
            // (No explicit destructor call: destroying *this and then
            // assigning to the destroyed object is undefined behavior.)
            release();
            _jni = other._jni;
            _ref = other._ref;
            other._jni = nullptr;
            other._ref = nullptr;
        }
        return *this;
    }

    // The owned local reference; nullptr if construction failed or the
    // weak global reference had already been collected.
    T local() {
        return _ref;
    }
};

std::atomic<bool> BaseWallClock::_enabled{false};

Expand Down Expand Up @@ -154,56 +289,89 @@ void WallClockASGCT::initialize(Arguments& args) {
}

void WallClockJVMTI::timerLoop() {
// Check for enablement before attaching/detaching the current thread
// Check for enablement before attaching/detaching the current thread
if (!isEnabled()) {
return;
}
// Attach to JVM as the first step
VM::attachThread("Datadog Profiler Wallclock Sampler");
auto collectThreads = [&](std::vector<ThreadEntry>& threads) {
jvmtiEnv* jvmti = VM::jvmti();
if (jvmti == nullptr) {
return;
}
JNIEnv* jni = VM::jni();

jint threads_count = 0;
jthread* threads_ptr = nullptr;
jvmti->GetAllThreads(&threads_count, &threads_ptr);

bool do_filter = Profiler::instance()->threadFilter()->enabled();
int self = OS::threadId();
VM::attachThread("Datadog Profiler Wallclock Sampler");
auto collectThreads = [&](std::vector<ThreadEntry>& threads) {
jvmtiEnv* jvmti = VM::jvmti();
if (jvmti == nullptr) {
return;
}
JNIEnv* jni = VM::jni();

for (int i = 0; i < threads_count; i++) {
jthread thread = threads_ptr[i];
if (thread != nullptr) {
VMThread* nThread = VMThread::fromJavaThread(jni, thread);
if (nThread == nullptr) {
continue;
}
int tid = nThread->osThreadId();
if (tid != self && (!do_filter || Profiler::instance()->threadFilter()->accept(tid))) {
threads.push_back({nThread, thread});
// When we collect thread list via JVMTI the threads will be effectively local references
// These local references would be automatically cleaned up once we get back from the native call to Java
// But here we are in an endless loop, never leaving the native call and as such all those local references
// would just keep on piling up
// Therefore, we have this neat RAII JVMTIThreads type which will take care of releasing the local references
// once we are out of scope here
JVMTIThreads thread_array(jvmti, jni);
bool do_filter = Profiler::instance()->threadFilter()->enabled();
int self = OS::threadId();
for (int i = 0; i < thread_array.count(); i++) {
jthread thread = thread_array[i];
if (thread != nullptr) {
VMThread* nThread = VMThread::fromJavaThread(jni, thread);
if (nThread == nullptr) {
continue;
}
jint thread_state;
if (jvmti->GetThreadState(thread, &thread_state) == JVMTI_ERROR_NONE &&
(thread_state & JVMTI_THREAD_STATE_TERMINATED) == 0) {
// It might not always be possible to resolve native thread from a java thread
// eg. when we are trying that very early in the thread startup
// In that case we would return -1 as the native tid and just skip that thread
int tid = VMThread::nativeThreadId(jni, thread);
if (tid != -1 && tid != self && (!do_filter || Profiler::instance()->threadFilter()->accept(tid))) {
// if NewWeakGlobalRef fails it will return 'nullptr' and we will skip it when the thread list is processed
threads.push_back({nThread, jni->NewWeakGlobalRef(thread)});
}
}
}
jvmti->Deallocate((unsigned char*)threads_ptr);
};
}
}
};

auto sampleThreads = [&](ThreadEntry& thread_entry, int& num_failures, int& threads_already_exited, int& permission_denied) {
static jint max_stack_depth = (jint)Profiler::instance()->max_stack_depth();
static jvmtiFrameInfo* frame_buffer = new jvmtiFrameInfo[max_stack_depth];
static jvmtiEnv* jvmti = VM::jvmti();
static JNIEnv* jni = VM::jni();

int num_frames = 0;
jvmtiError err = jvmti->GetStackTrace(thread_entry.java, 0, max_stack_depth, frame_buffer, &num_frames);
if (thread_entry.java_ref == nullptr) {
num_failures++;
return false;
}

MoveToLocal<jthread> mtl(jni, thread_entry.java_ref, true);
jthread thread = mtl.local();
if (thread == nullptr) {
num_failures++;
return false;
}

jint thread_state;
jvmtiError state_err = jvmti->GetThreadState(thread, &thread_state);
if (state_err != JVMTI_ERROR_NONE || (thread_state & JVMTI_THREAD_STATE_TERMINATED) != 0) {
num_failures++;
return false;
}
jvmtiError err = jvmti->GetStackTrace(thread, 0, max_stack_depth, frame_buffer, &num_frames);

if (err != JVMTI_ERROR_NONE) {
num_failures++;
if (err == JVMTI_ERROR_THREAD_NOT_ALIVE) {
threads_already_exited++;
}
return false;
}
if (num_frames == 0) {
// some JVMTI attached threads are Java-like but have no stack; we can just ignore them
return true;
}
ExecutionEvent event;
VMThread* vm_thread = thread_entry.native;
int raw_thread_state = vm_thread->state();
Expand Down Expand Up @@ -267,6 +435,5 @@ void WallClockASGCT::timerLoop() {
}
return true;
};

timerLoopCommon<int>(collectThreads, sampleThreads, _reservoir_size, _interval);
}
72 changes: 40 additions & 32 deletions ddprof-lib/src/main/cpp/wallClock.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ class BaseWallClock : public Engine {
// Profiler::recordSample().
int _reservoir_size;

pthread_t _thread;
virtual void timerLoop() = 0;
virtual void initialize(Arguments& args) {};
pthread_t _thread;
virtual void timerLoop() = 0;
virtual void initialize(Arguments& args) {};

static void *threadEntry(void *wall_clock) {
((BaseWallClock *)wall_clock)->timerLoop();
Expand Down Expand Up @@ -76,38 +76,46 @@ class BaseWallClock : public Engine {

while (_running.load(std::memory_order_relaxed)) {
collectThreads(threads);
int size = threads.size();
if (threads.size() > 0) {
int num_failures = 0;
int threads_already_exited = 0;
int permission_denied = 0;
std::vector<ThreadType> sample = reservoir.sample(threads);
for (ThreadType thread : sample) {
if (!sampleThreads(thread, num_failures, threads_already_exited, permission_denied)) {
continue;
}
}

int num_failures = 0;
int threads_already_exited = 0;
int permission_denied = 0;
std::vector<ThreadType> sample = reservoir.sample(threads);
for (ThreadType thread : sample) {
if (!sampleThreads(thread, num_failures, threads_already_exited, permission_denied)) {
continue;
epoch.updateNumSamplableThreads(threads.size());
epoch.updateNumFailedSamples(num_failures);
epoch.updateNumSuccessfulSamples(sample.size() - num_failures);
epoch.updateNumExitedThreads(threads_already_exited);
epoch.updateNumPermissionDenied(permission_denied);
u64 endTime = TSC::ticks();
u64 duration = TSC::ticks_to_millis(endTime - startTime);
if (epoch.hasChanged() || duration >= 1000) {
epoch.endEpoch(duration);
Profiler::instance()->recordWallClockEpoch(self, &epoch);
epoch.newEpoch(endTime);
startTime = endTime;
} else {
epoch.clean();
}
}

epoch.updateNumSamplableThreads(threads.size());
epoch.updateNumFailedSamples(num_failures);
epoch.updateNumSuccessfulSamples(sample.size() - num_failures);
epoch.updateNumExitedThreads(threads_already_exited);
epoch.updateNumPermissionDenied(permission_denied);
u64 endTime = TSC::ticks();
u64 duration = TSC::ticks_to_millis(endTime - startTime);
if (epoch.hasChanged() || duration >= 1000) {
epoch.endEpoch(duration);
Profiler::instance()->recordWallClockEpoch(self, &epoch);
epoch.newEpoch(endTime);
startTime = endTime;
} else {
epoch.clean();
threads.clear();
}

threads.clear();
// Get a random sleep duration
// clamp the random interval to <1,2N-1>
// the probability of clamping is extremely small, close to zero
OS::sleep(std::min(std::max((long int)1, static_cast<long int>(distribution(generator))), ((_interval * 2) - 1)));
// Restrict the random interval to <N/2,N+N/2>
// With the given parameters N/2 is 5 standard deviations from the mean ->
// the probability of falling outside of the given range is ~0.000058%
long limit = _interval / 2;
long int delay = _interval;
do {
delay = static_cast<long int>(distribution(generator));
} while (std::abs(delay - _interval) > limit);
OS::sleep(delay);
}
}

Expand Down Expand Up @@ -159,8 +167,8 @@ class WallClockJVMTI : public BaseWallClock {
void timerLoop() override;
public:
struct ThreadEntry {
VMThread* native;
jthread java;
VMThread* native;
jobject java_ref;
};
WallClockJVMTI() : BaseWallClock() {}
const char* name() override {
Expand Down
Loading