diff --git a/CMakeLists.txt b/CMakeLists.txt index e9bdeef..5b3c06f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,6 +64,17 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(LLAMA_STATIC ON CACHE BOOL "Build llama as static library") add_subdirectory("src/llama.cpp") +# apply patches +set(PATCH_FILE ${CMAKE_SOURCE_DIR}/patches/llama.patch) +add_custom_target(patch) +add_custom_command( + TARGET patch + COMMAND patch -N -p1 < ${PATCH_FILE} || true + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/src/llama.cpp + +) +add_dependencies(ggml patch) + file( GLOB SOURCE_FILES "src/addons.cc" @@ -95,6 +106,7 @@ if (LLAMA_QNN) message(FATAL_ERROR "QNN is not supported on this platform") endif() set(QNN_LIB_PATH ${QNN_ROOT}/lib/${QNN_PLATFORM}) + message(STATUS "QNN_LIB_PATH: ${QNN_LIB_PATH}") file( GLOB QNN_SO_FILES @@ -118,33 +130,22 @@ if (LLAMA_QNN) file( GLOB QNN_HEADER_FILES - "src/ggml-qnn.h" + "src/ggml-qnn/ggml-qnn.h" ) file( GLOB QNN_SOURCE_FILES - "src/ggml-qnn.cpp" + "src/ggml-qnn/pthread-shim.h" + "src/ggml-qnn/ggml-qnn.cpp" ) target_compile_definitions(ggml PUBLIC GGML_USE_QNN) - target_include_directories(ggml PUBLIC ${QNN_ROOT}/include/QNN) + target_include_directories(ggml PUBLIC ${QNN_ROOT}/include ${QNN_ROOT}/include/QNN) target_sources(ggml PRIVATE ${QNN_SOURCE_FILES} ${QNN_HEADER_FILES}) - target_include_directories(llama PRIVATE "src") + target_include_directories(llama PRIVATE "src/ggml-qnn") set_target_properties(ggml PROPERTIES CXX_STANDARD 17) - - # apply patches/qnn.patch to ggml - add_custom_command( - OUTPUT ${CMAKE_BUILD_DIR}/patch.log - COMMAND git apply ${CMAKE_SOURCE_DIR}/patches/qnn.patch - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/src/llama.cpp - ) -else() - # undo patches/qnn.patch to ggml - add_custom_command( - OUTPUT ${CMAKE_BUILD_DIR}/patch.log - COMMAND git apply -R ${CMAKE_SOURCE_DIR}/patches/qnn.patch - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/src/llama.cpp - ) + set_target_properties(ggml PROPERTIES CXX_STANDARD_REQUIRED ON) + set_target_properties(ggml PROPERTIES C_STANDARD 11) endif() add_library(${PROJECT_NAME} SHARED ${SOURCE_FILES} ${CMAKE_JS_SRC}) diff --git a/package.json b/package.json index fae134a..462098a 100644 --- a/package.json +++ b/package.json @@ -39,10 +39,8 @@ }, "files": [ "bin/**/*", - "scripts/*.js", - "scripts/*.ts", - "src/*", - "externals/**/*.{c,cc,cpp,h,hh,hpp,txt,cmake}", + "patches/*", + "src/**/*.{c,cc,cpp,h,hh,hpp,txt,cmake}", "lib/*.js", "lib/*.ts", "CMakeLists.txt" diff --git a/patches/qnn.patch b/patches/llama.patch similarity index 77% rename from patches/qnn.patch rename to patches/llama.patch index 3bbe1e1..156d5e7 100644 --- a/patches/qnn.patch +++ b/patches/llama.patch @@ -1,16 +1,7 @@ diff --git a/ggml-backend.c b/ggml-backend.c -index f5bdcf07..536a5767 100644 +index e91d97cd..be4989d3 100644 --- a/ggml-backend.c +++ b/ggml-backend.c -@@ -416,7 +416,7 @@ GGML_CALL static void ggml_backend_registry_init(void) { - } - - initialized = true; -- -+ printf("GGML_USE_CPU\n"); - ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL); - - // add forward decls here to avoid including the backend headers @@ -445,6 +445,10 @@ GGML_CALL static void ggml_backend_registry_init(void) { extern GGML_CALL void ggml_backend_kompute_reg_devices(void); ggml_backend_kompute_reg_devices(); @@ -23,7 +14,7 @@ index f5bdcf07..536a5767 100644 GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) { diff --git a/llama.cpp b/llama.cpp -index 18d6297c..f2a39613 100644 +index a25d115c..ff0d929f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -17,6 +17,8 @@ @@ -35,7 +26,7 @@ index 18d6297c..f2a39613 100644 #endif #ifdef GGML_USE_METAL -@@ -1679,6 +1681,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) { +@@ -1658,6 +1660,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) { buft = ggml_backend_opencl_buffer_type(); #elif defined(GGML_USE_KOMPUTE) buft = ggml_backend_kompute_buffer_type(gpu); @@ -44,18 +35,16 @@ index 18d6297c..f2a39613 100644 if (buft == nullptr) { LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu); } -@@ -15293,8 +15297,9 @@ bool llama_supports_mlock(void) { +@@ -14916,7 +14920,7 @@ bool llama_supports_mlock(void) { bool llama_supports_gpu_offload(void) { #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \ - defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) + defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_QNN) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. -+ printf("llama_supports_gpu_offload: true\n"); return true; #else - return false; -@@ -15607,6 +15612,16 @@ struct llama_context * llama_new_context_with_model( +@@ -15203,6 +15207,16 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(backend); } diff --git a/src/ggml-qnn.cpp b/src/ggml-qnn/ggml-qnn.cpp similarity index 98% rename from src/ggml-qnn.cpp rename to src/ggml-qnn/ggml-qnn.cpp index ef0a136..42e9325 100644 --- a/src/ggml-qnn.cpp +++ b/src/ggml-qnn/ggml-qnn.cpp @@ -50,10 +50,21 @@ #include #include #include +#include +#ifdef WIN32 +#include +#include +#include "pthread-shim.h" + +#define R_OK 4 /* Test for read permission. */ +#define W_OK 2 /* Test for write permission. */ +#define F_OK 0 /* Test for existence. */ +#define access _access +#else #include #include #include -#include +#endif #include #include @@ -79,16 +90,21 @@ #include #ifdef __cplusplus - #include - using std::atomic_int; - using std::atomic_bool; - using std::atomic_load; - using std::atomic_fetch_sub; - using std::atomic_store; +#include +using std::atomic_int; +using std::atomic_bool; +using std::atomic_load; +using std::atomic_fetch_sub; +using std::atomic_store; #else /* not __cplusplus */ - #include +#include #endif /* __cplusplus */ +// dummy fix https://github.com/skypjack/entt/issues/615#issuecomment-749511697 +#ifdef WIN32 +#define interface interface_ +#endif + #include "QnnTypes.h" #include "QnnCommon.h" #include "QnnContext.h" @@ -367,11 +383,31 @@ static void ggml_setup_op_has_task_pass(void) { //in GGML internal or FFmpeg //QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/HTP(aka DSP) backend currently +#ifdef WIN32 +/*struct ggml_backend_qnn_context { + int device; + int threads; + char name[GGML_MAX_NAME]; + char lib[GGML_MAX_NAME]; + qnn_instance * instance; + qnn_buf_t * buffer_pool; + struct ggml_backend * backend; + QNN_INTERFACE_VER_TYPE raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; +} ;*/ + +static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { + {0, 1, "qnn-cpu", "QnnCpu.dll", nullptr, nullptr, nullptr, nullptr, nullptr}, + {1, 1, "qnn-gpu", "QnnGpu.dll", nullptr, nullptr, nullptr, nullptr, nullptr}, + {2, 1, "qnn-htp(aka dsp)", "QnnHtp.dll", nullptr, nullptr, nullptr, nullptr, nullptr} +}; +#else static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { [QNN_CPU] = {.device = 0, .threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, [QNN_GPU] = {.device = 1, .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, [QNN_HTP] = {.device = 2, .threads = 1, .name = "qnn-htp(aka dsp)", .lib = "libQnnHtp.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, }; +#endif @@ -889,7 +925,17 @@ static size_t memscpy(void * dst, size_t dstSize, const void * src, size_t copyS static char * ggml_qnn_strndup(const char * source, size_t maxlen) { - return ::strndup(source, maxlen); +#ifdef WIN32 + char * dest = (char *)malloc(maxlen + 1); + if (dest == nullptr) { + return nullptr; + } + strncpy_s(dest, maxlen + 1, source, maxlen); + dest[maxlen] = '\0'; + return dest; +#else + return ::strndup(source, maxlen); +#endif } @@ -1118,7 +1164,11 @@ static uint32_t ggml_get_tensor_data_size(const ggml_tensor * tensor) { template Fn load_qnn_functionpointers(void * handle, const char * function_name) { +#ifdef WIN32 + return reinterpret_cast(GetProcAddress(reinterpret_cast(handle), function_name)); +#else return reinterpret_cast(dlsym(handle, function_name)); +#endif } @@ -2034,19 +2084,35 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * Qnn_ErrorHandle_t error = QNN_SUCCESS; QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); +#ifdef WIN32 + void *lib_handle = LoadLibrary(lib_path.c_str()); + if (nullptr == lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, with error: %d", lib_path.c_str(), GetLastError()); + return 1; + } +#else void *lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); if (nullptr == lib_handle) { QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); return 1; } +#endif // load get_provider function +#ifdef WIN32 + auto get_providers = (_pfn_QnnInterface_getProviders *)GetProcAddress((HMODULE)lib_handle, "QnnInterface_getProviders"); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %d", GetLastError()); + return 2; + } +#else auto get_providers = load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>(lib_handle, "QnnInterface_getProviders"); if (nullptr == get_providers) { QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); return 2; } +#endif // get QnnInterface Providers std::uint32_t num_providers = 0; @@ -2094,10 +2160,14 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * _loaded_backend[backend_id] = provider_list[0]; if (_loaded_lib_handle.count(backend_id) > 0) { QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); +#ifdef WIN32 + FreeLibrary((HMODULE)_loaded_lib_handle[backend_id]); +#else int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); if (dlclose_error != 0) { QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); } +#endif } _loaded_lib_handle[backend_id] = lib_handle; _backend_id = backend_id; @@ -2136,6 +2206,11 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * int qnn_instance::unload_backend() { ENTER_FUNC(); +#ifdef WIN32 + for (auto &it : _loaded_lib_handle) { + FreeLibrary((HMODULE)it.second); + } +#else int dlclose_error = 0; for (auto &it : _loaded_lib_handle) { dlclose_error = dlclose(it.second); @@ -2143,6 +2218,7 @@ int qnn_instance::unload_backend() { QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); } } +#endif _loaded_lib_handle.clear(); _lib_path_to_backend_id.clear(); @@ -2161,6 +2237,20 @@ int qnn_instance::load_system() { std::string system_lib_path = _lib_path + "libQnnSystem.so"; QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); +#ifdef WIN32 + _system_lib_handle = LoadLibrary(system_lib_path.c_str()); + if (nullptr == _system_lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, error: %d\n", system_lib_path.c_str(), GetLastError()); + return 1; + } + + auto * get_providers = reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>(GetProcAddress( + (HMODULE)_system_lib_handle, "QnnSystemInterface_getProviders")); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %d\n", GetLastError()); + return 2; + } +#else _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); if (nullptr == _system_lib_handle) { QNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); @@ -2173,6 +2263,7 @@ int qnn_instance::load_system() { QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); return 2; } +#endif uint32_t num_providers = 0; const QnnSystemInterface_t ** provider_list = nullptr; @@ -2243,11 +2334,15 @@ int qnn_instance::unload_system() { _qnn_system_handle = nullptr; } +#ifdef WIN32 + FreeLibrary((HMODULE)_system_lib_handle); +#else int dlclose_error = dlclose(_system_lib_handle); if (dlclose_error != 0) { QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); return 2; } +#endif _system_lib_handle = nullptr; LEAVE_FUNC(); @@ -2406,7 +2501,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } } -#ifdef __ANDROID__ +#if 0 // Latest SDK seems no libcdsprpc _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); if (nullptr == _rpc_lib_handle) { QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); @@ -2458,7 +2553,7 @@ int qnn_instance::qnn_finalize() { if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC based low-end phone happy _pfn_rpc_mem_deinit(); -#ifdef __ANDROID__ +#if 0 if (dlclose(_rpc_lib_handle) != 0) { QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); } else { @@ -4120,11 +4215,15 @@ static const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t static void * ggml_qnn_host_malloc(size_t n) { void * data = nullptr; +#ifdef WIN32 + data = _aligned_malloc(n, 32); +#else const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); if (result != 0) { QNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); return nullptr; } +#endif return data; } @@ -4136,12 +4235,20 @@ static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_back ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context; +#ifdef WIN32 + const size_t size_page = 4096; + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } +#else const size_t size_page = sysconf(_SC_PAGESIZE); size_t size_aligned = size; if ((size_aligned % size_page) != 0) { size_aligned += (size_page - (size_aligned % size_page)); } +#endif //QNN_LOG_DEBUG("size %d, %d MB", size_aligned, size_aligned / (1 << 20)); @@ -4538,7 +4645,11 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput while (true) { if (do_yield) { +#ifdef WIN32 + Sleep(0); +#else sched_yield(); +#endif } * node_n = atomic_load(&state->shared->node_n); @@ -4553,7 +4664,11 @@ static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_co while (true) { if (do_yield) { +#ifdef WIN32 + Sleep(0); +#else sched_yield(); +#endif } * task_phase = atomic_load(&state->shared->node_task); @@ -4982,12 +5097,7 @@ static ggml_status ggml_backend_qnn_graph_compute_multithread(ggml_backend_t bac // create thread pool if (n_threads > 1) { for (int j = 1; j < n_threads; ++j) { - workers[j] = (struct ggml_compute_state) { - .thrd = 0, - .ith = j, - .shared = &state_shared, - .ec = GGML_STATUS_SUCCESS, - }; + workers[j] = { 0, j, &state_shared, GGML_STATUS_SUCCESS }; const int rc = pthread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); GGML_ASSERT(rc == 0); @@ -5154,7 +5264,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { } // if build for windows, PATH_DELIMITER is '\' -#ifdef _WINDOWS_ +#ifdef WIN32 #define PATH_DELIMITER '\\' #define QNN_SYS_LIB_NAME "QnnSystem.dll" diff --git a/src/ggml-qnn.h b/src/ggml-qnn/ggml-qnn.h similarity index 100% rename from src/ggml-qnn.h rename to src/ggml-qnn/ggml-qnn.h diff --git a/src/ggml-qnn/pthread-shim.h b/src/ggml-qnn/pthread-shim.h new file mode 100644 index 0000000..eb547b8 --- /dev/null +++ b/src/ggml-qnn/pthread-shim.h @@ -0,0 +1,199 @@ +#ifdef _MSC_VER + +#include +#include +#include + +typedef HANDLE pthread_mutex_t; +typedef int pthread_condattr_t; +typedef HANDLE pthread_t; +typedef DWORD pthread_attr_t; + +#define pthread_create(thrp, attr, func, arg) \ + (((*(thrp) = CreateThread(NULL, 0, \ + (LPTHREAD_START_ROUTINE)(func), (arg), 0, NULL)) == NULL) ? -1 : 0) +#define pthread_join(thr, statusp) \ + ((WaitForSingleObject((thr), INFINITE) == WAIT_OBJECT_0) && \ + ((statusp == NULL) ? 0 : \ + (GetExitCodeThread((thr), (LPDWORD)(statusp)) ? 0 : -1))) + +#define PTHREAD_MUTEX_INITIALIZER {(void*)-1,-1,0,0,0,0} +#define pthread_mutex_lock(pobject) WaitForSingleObject(*pobject,INFINITE) +#define pthread_mutex_unlock(pobject) ReleaseMutex(*pobject) +#define pthread_mutex_init(pobject,pattr) (*pobject=CreateMutex(NULL,FALSE,NULL)) +#define pthread_mutex_destroy(pobject) CloseHandle(*pobject) + +/* Windows doesn't have this, so declare it ourselves. */ +// vs2022 has this struct +//typedef struct timespec { +// /* long long in windows is the same as long in unix for 64bit */ +// long long tv_sec; +// long long tv_nsec; +//} timespec; + +typedef struct { + int waiters_count_; + // Number of waiting threads. + + CRITICAL_SECTION waiters_count_lock_; + // Serialize access to . + + HANDLE sema_; + // Semaphore used to queue up threads waiting for the condition to + // become signaled. + + HANDLE waiters_done_; + // An auto-reset event used by the broadcast/signal thread to wait + // for all the waiting thread(s) to wake up and be released from the + // semaphore. + + size_t was_broadcast_; + // Keeps track of whether we were broadcasting or signaling. This + // allows us to optimize the code if we're just signaling. +} pthread_cond_t; + +static unsigned long long _pthread_time_in_ms(void) +{ + struct __timeb64 tb; + + _ftime64(&tb); + + return tb.time * 1000 + tb.millitm; +} + +static unsigned long long _pthread_time_in_ms_from_timespec(const struct timespec *ts) +{ + unsigned long long t = ts->tv_sec * 1000; + t += ts->tv_nsec / 1000000; + + return t; +} + +static unsigned long long _pthread_rel_time_in_ms(const struct timespec *ts) +{ + unsigned long long t1 = _pthread_time_in_ms_from_timespec(ts); + unsigned long long t2 = _pthread_time_in_ms(); + + /* Prevent underflow */ + if (t1 < t2) return 0; + return t1 - t2; +} + +static int pthread_cond_init(pthread_cond_t *cv, const pthread_condattr_t * ignore) +{ + cv->waiters_count_ = 0; + cv->was_broadcast_ = 0; + cv->sema_ = CreateSemaphore(NULL, // no security + 0, // initially 0 + 0x7fffffff, // max count + NULL); // unnamed + if (cv->sema_ == NULL) + return GetLastError(); + + InitializeCriticalSection(&cv->waiters_count_lock_); + cv->waiters_done_ = CreateEvent(NULL, // no security + FALSE, // auto-reset + FALSE, // non-signaled initially + NULL); // unnamed + return (cv->waiters_done_ == NULL) ? GetLastError() : 0; +} + +static int pthread_cond_destroy(pthread_cond_t *cond) +{ + CloseHandle(cond->sema_); + DeleteCriticalSection(&cond->waiters_count_lock_); + return (CloseHandle(cond->waiters_done_) == 0) ? GetLastError() : 0; +} + +static int pthread_cond_signal(pthread_cond_t *cv) +{ + int have_waiters; + EnterCriticalSection(&(cv->waiters_count_lock_)); + have_waiters = cv->waiters_count_ > 0; + LeaveCriticalSection(&cv->waiters_count_lock_); + + // If there aren't any waiters, then this is a no-op. + if (have_waiters){ + return (ReleaseSemaphore(cv->sema_, 1, 0) == 0) ? GetLastError() : 0; + } + else + return 0; +} + +static int pthread_cond_broadcast(pthread_cond_t *cv) +{ + // This is needed to ensure that and are + // consistent relative to each other. + int have_waiters = 0; + EnterCriticalSection(&cv->waiters_count_lock_); + + if (cv->waiters_count_ > 0) { + // We are broadcasting, even if there is just one waiter... + // Record that we are broadcasting, which helps optimize + // for the non-broadcast case. + cv->was_broadcast_ = 1; + have_waiters = 1; + } + + if (have_waiters) { + // Wake up all the waiters atomically. + ReleaseSemaphore(cv->sema_, cv->waiters_count_, 0); + + LeaveCriticalSection(&cv->waiters_count_lock_); + + // Wait for all the awakened threads to acquire the counting + // semaphore. + WaitForSingleObject(cv->waiters_done_, INFINITE); + // This assignment is okay, even without the held + // because no other waiter threads can wake up to access it. + cv->was_broadcast_ = 0; + } + else + LeaveCriticalSection(&cv->waiters_count_lock_); + + return 0; +} + +static int pthread_cond_wait(pthread_cond_t *cv, pthread_mutex_t *external_mutex) +{ + int last_waiter; + // Avoid race conditions. + EnterCriticalSection(&cv->waiters_count_lock_); + cv->waiters_count_++; + LeaveCriticalSection(&cv->waiters_count_lock_); + + // This call atomically releases the mutex and waits on the + // semaphore until or + // are called by another thread. + SignalObjectAndWait(*external_mutex, cv->sema_, INFINITE, FALSE); + + // Reacquire lock to avoid race conditions. + EnterCriticalSection(&cv->waiters_count_lock_); + + // We're no longer waiting... + cv->waiters_count_--; + + // Check to see if we're the last waiter after . + last_waiter = cv->was_broadcast_ && cv->waiters_count_ == 0; + + LeaveCriticalSection(&cv->waiters_count_lock_); + + // If we're the last waiter thread during this particular broadcast + // then let all the other threads proceed. + if (last_waiter) + // This call atomically signals the event and waits until + // it can acquire the . This is required to ensure fairness. + SignalObjectAndWait(cv->waiters_done_, *external_mutex, INFINITE, FALSE); + else + // Always regain the external mutex since that's the guarantee we + // give to our callers. + WaitForSingleObject(*external_mutex, INFINITE); + + return 0; +} + +#else // linux + +#include + +#endif \ No newline at end of file