diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 7bae4b335..f250a6600 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -153,7 +153,7 @@ jobs: # In any case Debug, at the CMake script (in meta-project ./, and in flow/, ipc_*/) level, # means LTO will be ignored (only *Rel* build-types enable LTO, if so instructed). # Still keeping this here to make that clear to the reader/maintainer. Could remove it though. - no-lto: true + no-lto: True - id: release conan-profile-build-type: Release conan-profile-jemalloc-build-type: Release @@ -166,7 +166,7 @@ jobs: # we can use a test of non-LTO building. # TODO: Perhaps this should be a separate matrix dimension (LTO on, LTO off). Number of configs will # jump up, but it is more methodical and nice. - no-lto: true + no-lto: True - id: minsizerel conan-profile-build-type: MinSizeRel conan-profile-jemalloc-build-type: Release @@ -193,7 +193,7 @@ jobs: # At least ASAN with clang + LTO => cryptic link error. # Regardless regular RelWithDebInfo already has no-lto=true; so we would follow suit. Just be aware # that changing it to false for whatever reason => ASAN probably breaks. - no-lto: true + no-lto: True - id: relwithdebinfo-ubsan conan-profile-build-type: RelWithDebInfo conan-profile-jemalloc-build-type: Release @@ -214,7 +214,7 @@ jobs: sanitizer-name: ubsan # While UBSAN might work with LTO, I do not want the aggravation/entropy. Turn it off. # Plus inheriting from regular RelWithDebInfo anyway. - no-lto: true + no-lto: True - id: relwithdebinfo-tsan conan-profile-build-type: RelWithDebInfo conan-profile-jemalloc-build-type: Release @@ -247,14 +247,13 @@ jobs: Borrower_shm_pool_collection_test.Multiprocess:\ Shm_pool_collection_test.Interface:\ Shm_pool_collection_test.Multiprocess:\ - Shm_session_data_test.Multithread_object_database:\ - Jemalloc_shm_pool_collection_DeathTest.Interface + Shm_session_data_test.Multithread_object_database sanitizer-name: tsan # While TSAN might work with LTO, I do not want the aggravation/entropy. Turn it off. # Also, for some clangs, there are TSAN WARNINGs at times about too-small symbolizer buffer or something; # throwing LTO into the mix seems like unnecessary entropy. # Plus inheriting from regular RelWithDebInfo anyway. - no-lto: true + no-lto: True - id: relwithdebinfo-msan conan-profile-build-type: RelWithDebInfo conan-profile-jemalloc-build-type: Release @@ -275,7 +274,7 @@ jobs: sanitizer-name: msan # While MSAN might work with LTO, I do not want the aggravation/entropy. Turn it off. # Plus inheriting from regular RelWithDebInfo anyway. - no-lto: true + no-lto: True # We concentrate on clang sanitizers; they are newer/nicer; also MSAN is clang-only. So gcc ones excluded. # Attention! Excluding some sanitizer job(s) (with these reasons): @@ -300,9 +299,14 @@ jobs: # So this status quo is pretty good. TODO: Do the work/get MSAN functional/useful; un-exclude it then. # - *SAN with gcc: We concentrate on clang sanitizers; they are newer/nicer; having to worry about differences # between them is an excessive burden. So excluding gcc ASAN/UBSAN/TSAN. - # - TODO: Consider reducing to the newest clang. Generally they just keep improving; the chances of - # something being uncaught (incorrectly) with a later version are slim. Not impossible though. Look - # into it. + # - TODO: Consider reducing to the newest or most stable clang. Generally they just keep improving; the + # chances of something being uncaught (incorrectly) with a later version are slim. Not impossible though. + # Look into it. Update: As of this writing transport_test exercise mode/SHM-jemalloc sub-mode is + # disabled for TSAN clang-17 due to instability of TSAN itself. So clearly when it comes to TSAN + # (which is officially in beta as of clang-17), higher version does not mean everything is better. + # Hence perhaps this to-do is more of a longer-term thing, when all the sanitizers stabilize, or when + # we find a single very-stable config. Just remember we aren't testing the sanitizer or our code's + # ability to be sanitized; we just want to detect any problems in the code -- however we get there. # - *SAN with clang-13 (but not 15+): As of this writing clang-13 produces some additional warnings, # at least in TSAN, which appear to be related to nearby non-race messages like # `==75454==WARNING: Symbolizer buffer too small`. As a result, we either have to eliminate the @@ -477,6 +481,32 @@ jobs: yaml.dump(data, file) " + # Important info/somewhat important TODO: The C[XX]FLAGS and linker-flags are supplied via + # ${{ matrix.build-test-cfg.conan-profile-custom-conf }} and + # ${{ matrix.build-test-cfg.conan-profile-custom-buildenv }}. These affect not just our objects/libs/executables; + # but also at least: 3rd party libs (jemalloc, capnp/kj, boost, gtest), 3rd party executables + # (capnp compiler binary; temp binaries created by auto-tools configure scripts to test features). + # In the case of the libs that's usually good; in particular things like + # -fsanitize=address (ASAN) (for build-type relwithdebinfo-asan) ideally are applied to all built code + # uniformly. (There are other libs being built, one can see: at least openssl and zlib; they're not really + # involved in our actual product, at least not in a core way, so for those it arguably matters less either way.) + # In the case of the binaries it is usually bad; we want the fastest, most normal tools possible, not ones + # built weirdly with whatever settings we're trying to test with our own software. For one it makes those + # tools slower -- perhaps not the hugest deal in practice here -- but it also generally increases entropy; and + # it has also resulted in various pain: + # - Cannot use -fno-sanitize-recover with UBSAN (abort program on warning) if we wanted to: it causes some + # hidden configure-script-generated binary abort => capnp binary build fails. + # - capnp binary fails MSAN at startup; hence capnp-compilation of our .capnp schemas fails. + # We have worked around all these, so the thing altogether works. It's just somewhat odd and entropy-ridden; + # and might cause maintanability problems over time, as it has already in the past. + # The TODO is to be more judicious about it + # and only apply these things to the libs/executables we want it applied. It is probably not so simple; + # but worst-case it should be possible to use something like build-type-cflags-override to target our code; + # and per-recipe (Boost, jemalloc, gtest...) techniques to target others; and not use the aggressive + # conan-profile-custom-conf and conan-profile-custom-buildenv. That said, there will be interesting subtleties + # like: libcapnp/kj are linked into capnp compiler binary; *and* to our code. So we should instrument it with + # sanitizer (when applicable); now some of the capnp-compiler-binary is instrumented; some isn't. Would that + # work, or would it be better to build capnp twice then (once for the binary, once for the linked libraries)? - name: Create Conan profile run: | cat <<'EOF' > conan_profile @@ -507,9 +537,12 @@ jobs: ipc:doc = False ipc:build = True EOF - if [ "${{ matrix.build-test-cfg.no-lto }}" != '' ]; then + if [ "${{ matrix.build-test-cfg.no-lto }}" != '' ] && [ "${{ matrix.build-test-cfg.no-lto }}" != 'False' ]; then echo 'ipc:build_no_lto = True' >> conan_profile fi + if [ "${{ matrix.build-test-cfg.build-type-cflags-override }}" != '' ]; then + echo 'ipc:build_type_cflags_override = ${{ matrix.build-test-cfg.build-type-cflags-override }}' >> conan_profile + fi # We need to prepare a sanitizer ignore-list in MSAN mode. Background for this is subtle and annoying: # As it stands, whatever matrix compiler/build-type is chosen applies not just to our code (correct) @@ -784,24 +817,22 @@ jobs: # Script created by pipeline during job. echo "Log level: [$1]." echo "Exercise sub-mode: [$2]." - echo "Sub-mode snippet (none or 'shm-?'): [$3]." + echo "Sub-mode snippet (none or '-shm-?'): [$3]." OUT_DIR=${{ env.install-dir }}/bin/logs/transport_test/exercise/$2 mkdir -p $OUT_DIR SUPP_DIR_A=${{ github.workspace }}/flow/src - { cat $SUPP_DIR_A/${{ env.san-suppress-cfg-in-file1 }} $SUPP_DIR_A/${{ env.san-suppress-cfg-in-file2 }} \ - > ${{ env.san-suppress-cfg-file }} 2> /dev/null; } || true if [ "$3" == '-shm-j' ]; then - # SHM-jemalloc mode => jemalloc is in fact exercised => suppress jemalloc stuff. (UBSAN, TSAN) + # SHM-jemalloc mode => jemalloc is in fact exercised => suppress libjemalloc stuff. + # But, nicely, no transport_test-specific suppressions needed as of this writing. SUPP_DIR_B=${{ github.workspace }}/ipc_shm_arena_lend/src - { cat $SUPP_DIR_B/${{ env.san-suppress-cfg-in-file1 }} $SUPP_DIR_B/${{ env.san-suppress-cfg-in-file2 }} \ - >> ${{ env.san-suppress-cfg-file }} 2> /dev/null; } || true elif [ "$3" == '-shm-c' ]; then - # SHM-classic mode => TSAN gets confused by opposing process (e.g., client) allocating while current process - # (e.g., accordingly, server) deallocates => false-positives when accessing SHM-mapped areas. (TSAN) SUPP_DIR_B=${{ github.workspace }}/test/suite/transport_test/shm-c - { cat $SUPP_DIR_B/${{ env.san-suppress-cfg-in-file1 }} $SUPP_DIR_B/${{ env.san-suppress-cfg-in-file2 }} \ - >> ${{ env.san-suppress-cfg-file }} 2> /dev/null; } || true + else # if [ "$3" == '' ]; then + SUPP_DIR_B=${{ github.workspace }}/test/suite/transport_test/heap fi + { cat $SUPP_DIR_A/${{ env.san-suppress-cfg-in-file1 }} $SUPP_DIR_A/${{ env.san-suppress-cfg-in-file2 }} \ + $SUPP_DIR_B/${{ env.san-suppress-cfg-in-file1 }} $SUPP_DIR_B/${{ env.san-suppress-cfg-in-file2 }} \ + > ${{ env.san-suppress-cfg-file }} 2> /dev/null; } || true ${{ env.setup-tests-env }} ~/bin/ex_srv.exec exercise-srv$3 $OUT_DIR/srv.log info $1 \ > $OUT_DIR/srv.console.log 2>&1 & @@ -851,10 +882,36 @@ jobs: ${{ env.install-dir }}/bin/run_transport_test_ex.sh \ data shm_classic_log_level_data -shm-c + # Disabling this particular test run for the specific case of clang-17 in TSAN (thread sanitizer) config + # (in particular at least 2 other clangs+TSAN are exercised, so the TSAN coverage is still good). + # First the reason in detail: This run always fails at this point in the server binary: + # 2023-12-20 11:36:11.322479842 +0000 [info]: Tguy: ex_srv.hpp:send_req_b(1428): App_session [0x7b3800008180]: + # Chan B[0]: Filling/send()ing payload (description = [reuse out-message + SHM-handle to modified (unless + # SHM-jemalloc) existing STL data]; alt-payload? = [0]; reusing msg? = [1]; reusing SHM payload? = [1]). + # LLVM ERROR: Sections with relocations should have an address of 0 + # PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace. + # Stack dump: + # 0. Program arguments: /usr/bin/llvm-symbolizer-17 --demangle --inlines --default-arch=x86_64 + # Stack dump without symbol names (ensure you have llvm-symbolizer in your PATH or set the environment var `LLVM_SYMBOLIZER_PATH` to point to it): + # ... + # ==77990==WARNING: Can't read from symbolizer at fd 599 + # 2023-12-20 11:36:31.592293322 +0000 [info]: Tguy: ex_srv.hpp:send_req_b(1547): App_session [0x7b3800008180]: Chan B[0]: Filling done. Now to send. + # Sometimes the exact point is different, depending on timing; but in any case it is always the above + # TSAN/LLVM error, at which point the thread gets stuck for a long time (10+ seconds); but eventually gets + # unstuck; however transport_test happens to be testing a feature in a certain way so that a giant blocking + # operation in this thread delays certain processing, causes an internal timeout, and the test exits/fails. + # Sure, we could make some changes to the test for that to not happen, but that's beside the point: TSAN + # at run-time is trying to do something and fails terribly; I have no wish to try to work around that situation; + # literally it says "PLEASE submit a bug report [to clang devs]." + # + # TODO: Revisit; figure out how to not trigger this; re-enable. For the record, I (ygoldfel) cannot reproduce + # in a local clang-17, albeit with libc++ (LLVM STL) instead of libstdc++ (GNU STL). I've also tried to + # reduce optimization to -O1, as well as with and without LTO; same result. - name: Run integration test [transport_test - Exercise mode - SHM-jemalloc sub-mode] id: transport_test_ex_shm_j if: | - !cancelled() + (!cancelled()) && ((matrix.compiler.id != 'clang-17') || + (matrix.build-test-cfg.sanitizer-name != 'tsan')) run: | /usr/bin/bash -e \ ${{ env.install-dir }}/bin/run_transport_test_ex.sh \ diff --git a/conanfile.py b/conanfile.py index 241470ca2..e5eee74ce 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,12 +9,25 @@ class IpcRecipe(ConanFile): options = { "build": [True, False], "build_no_lto": [True, False], + # Replaces the core C/C++ compiler flags (not linker flags) for the chosen settings.build_type. + # Note that these appear *after* any C[XX]FLAGS and tools.build.c[xx]flags + # on the compiler command line, so it would not be + # sufficient to instead add the desired flags to tools.build* or *FLAGS, as if a setting in + # the core CMake-chosen flags conflicts with one of those, the core one wins due to being last on command + # line. Long story short, this is for the core flags, typically: -O [-g] [-DNDEBUG]. + # So default for, e.g., RelWithDebInfo in Linux = -O2 -g -DNDEBUG; and one could set + # this option to "-O3 -g -DNDEBUG" to increase the optimization level. + # + # This affects `ipc` CMake only; meaning flow, ipc_*, ipc objects will have this overridden; while + # Boost libs, jemalloc lib, capnp/kj libs will build how they would've built anyway. + "build_type_cflags_override": "ANY", "doc": [True, False], } default_options = { "build": True, "build_no_lto": False, + "build_type_cflags_override": "", "doc": False, } @@ -26,6 +39,7 @@ def configure(self): def generate(self): deps = CMakeDeps(self) if self.options.doc: + # TODO: This magic version number is repeated several times. Code reuse. deps.build_context_activated = ["doxygen/1.9.4"] deps.generate() @@ -38,6 +52,11 @@ def generate(self): if self.options.doc: toolchain.variables["CFG_ENABLE_DOC_GEN"] = "ON" toolchain.variables["CFG_SKIP_CODE_GEN"] = "ON" + if self.options.build_type_cflags_override: + suffix = str(self.settings.build_type).upper() + toolchain.variables["CMAKE_CXX_FLAGS_" + suffix] = self.options.build_type_cflags_override + toolchain.variables["CMAKE_C_FLAGS_" + suffix] = self.options.build_type_cflags_override + toolchain.generate() def build(self): diff --git a/flow b/flow index df3bead9b..caf751b40 160000 --- a/flow +++ b/flow @@ -1 +1 @@ -Subproject commit df3bead9b3796330f7555b4f239f17ce7336e4d3 +Subproject commit caf751b403e9adfccae35894965f5eae2cbc1492 diff --git a/ipc_shm_arena_lend b/ipc_shm_arena_lend index 277de175b..f6b89b851 160000 --- a/ipc_shm_arena_lend +++ b/ipc_shm_arena_lend @@ -1 +1 @@ -Subproject commit 277de175b557acf2f3628225a94ce6bf88d82712 +Subproject commit f6b89b851b66e7750966a4d7317280d003274e17 diff --git a/src/doc/manual/c-setup.dox.txt b/src/doc/manual/c-setup.dox.txt index d71aede79..a58dfe795 100644 --- a/src/doc/manual/c-setup.dox.txt +++ b/src/doc/manual/c-setup.dox.txt @@ -30,9 +30,13 @@ OS and build environment ------------------------ @note In the short term (as of October 2023) this may become an open-source library with potentially wide distribution. Some of the below items may change in that case; for example it may become a header-only library or have that mode, and it may support additional OS rather than Linux. We would also include specific information on building it from source as befits a wide-release project. Until then we leave out specific instructions on building the library itself as outside the scope of the present document; while listing the environmental requirements/recommendations as follows. It is vaguely informational; until this is a wide-release library we stay away from rigorous build instructions here. Even once it becomes a wide-release product, possibly such instructions shall live outside this manual. Looking outside the src/ directory you'll certainly find the relevant build scripts which cover all that. -This is a Linux library, as of this writing to be built in 64-bit mode (x86-64 a/k/a AMD64). (Most code by far is not architecture-specific, but certain aspects of the optionally-used SHM-jemalloc component are.) It is generally intended for use in a deployed server setting. As of this writing it relies on some Linux-specific techniques such as /proc/...pid.../ and /dev/shm/ semantics. In the future it is quite conceivable it could be extended to other OS and architectures -- possibly even Windows, but definitely MacOS/Darwin and FreeBSD -- but currently that is not a specific aim/priority of the project, as we concentrate on Linux. +This is a Linux library (actually set of libraries, but in this Manual we treat Flow-IPC as a monolithic whole for simplicity). As of this writing it is to be built in 64-bit mode (x86-64 a/k/a AMD64). (Most code by far is not OS/architecture-specific, but at least certain aspects of the optionally-used SHM-jemalloc component are.) It is generally intended for use in a deployed server setting. As of this writing it relies on some Linux-specific techniques such as /proc/...pid.../ and /dev/shm/ semantics. In the future it is quite conceivable it could be extended to other OS and architectures -- possibly even Windows, but definitely MacOS/Darwin and FreeBSD -- but currently that is not a specific aim/priority of the project, as we concentrate on Linux. -It is a linked library that must be built in **C++17** compiler mode, as must any of its `#include`rs (translation units). It has been specifically tested with gcc-8 and gcc-9 as of this writing. common.hpp will `#error` out if an `#include`r is not being built with a supported compiler; and will `#warning` for compilers that would probably work but have not been tested, such as gcc below gcc-8. (Support for clang is likely attainable and may arrive in the short term, if it has not already. Again: look outside src/ for build scripts and related instructions if any.) +It is a linked library (libraries) that must be built in **C++17** compiler mode, as must any of its `#include`rs (translation units). It has been specifically tested (as of this writing) on a range of gcc and clang compilers/linkers. We omit details in this text, since this is likely to evolve over time. Generally such topics are well covered outside of src/ directories; start with the top-level `README.md` for an overview; it will point you to CMake scripts and so on -- if there is interest. + +Regarding the briefly-mentioned-above testing: As of this writing there is an automated CI/CD test suite which checks the various combinations of compiler and build-type (debug, release, various sanitizers, etc.) -- both building and functionality. Again we omit details here; but as of this writing you can see the details at the official open-source project (GitHub organization "Flow-IPC," as I write this). You'll likely find a few files like `.github/...`, `conanfile.py`, and sanitizer-specific `.cfg` files which control this stuff. This is well outside our scope in this Manual, but we wanted you to be aware of such things. + +Regarding test code itself: We do not talk about it in this Manual or Reference (again: out of scope); but it is in the same repo (repos). Some test code (of the unit-test variety) is in `test/` subdirs at various levels near production code; other test code is outside `src/` entirely -- whether the unit test driver program(s) or the various integration-tests. Flow-IPC strongly relies on a foundational library named Flow. We expose certain aspects of Flow, most notably for logging (discussed below), through the API as well. Informally and orthogonally, Flow is a nice tool we feel you're likely to find useful in any case at times. (For capnp users: an analogy might be capnp's use of their in-house `kj` library both internally and at times exposed in capnp's own API.) In any case you'll need to link and build with Flow to use Flow-IPC. @@ -40,7 +44,9 @@ Flow has some dependencies, notably Boost, including certain linked (non-header- capnp (Cap'n Proto) is an important dependency; you must build and link with capnp to use Flow-IPC. In particular that one, at the version we require, requires all `#include`rs to be built in C++17 mode. Flow-IPC inherits this requirement from capnp (as well as Flow) and is not shy about using C++17 including in .hpp files. -All in all, if you've got a reasonably modern Linux build and runtime environment, your application is built in at least C++17 mode, then there's nothing too exotic going on that would prevent the use of Flow-IPC (or Flow). Boost and capnp do not introduce any array of exotic dependencies. +Lastly, if one chooses to use the SHM-jemalloc provider of zero-copy transmission and/or direct SHM use, then the jemalloc library is a dependency as well. Building jemalloc from source is a fairly simple task (a topic applicable if your consumer program does not already use jemalloc as a `malloc()` provider or other reasons). + +All in all, if you've got a reasonably modern Linux build and runtime environment, and your application is built in at least C++17 mode, then there's nothing too exotic going on that would prevent the use of Flow-IPC (or Flow). Boost, capnp, and jemalloc do not introduce any array of exotic dependencies. Error reporting --------------- diff --git a/test/suite/transport_test/ex_guy.hpp b/test/suite/transport_test/ex_guy.hpp index f885c1d4b..0ed82fa8f 100644 --- a/test/suite/transport_test/ex_guy.hpp +++ b/test/suite/transport_test/ex_guy.hpp @@ -35,7 +35,7 @@ ( \ if (!(expr)) \ { \ - std::cerr << FLOW_UTIL_WHERE_AM_I() << "Condition failed; aborting. The condition: [" << #expr << "].\n"; \ + std::cerr << FLOW_UTIL_WHERE_AM_I() << ": Condition failed; aborting. The condition: [" << #expr << "].\n"; \ std::abort(); \ } \ ) diff --git a/test/suite/transport_test/heap/sanitize/tsan/suppressions_clang.cfg b/test/suite/transport_test/heap/sanitize/tsan/suppressions_clang.cfg new file mode 100644 index 000000000..2c859002f --- /dev/null +++ b/test/suite/transport_test/heap/sanitize/tsan/suppressions_clang.cfg @@ -0,0 +1,71 @@ +# Current version assumption: clang-15/16/17. + +# Had some issues matching ^ and $ in this one; leaving them out; these are very unlikely to match something +# unintentional. + +# This is for transport_test exercise mode heap sub-mode. + +# I have seen this warning exactly once as of this writing, with clang-15. Upon careful analysis: it's got to be +# a false positive. It looks like this: +# WARNING: ThreadSanitizer: data race (pid=75301) +# Write of size 8 at 0x7ba8000006f0 by thread T79: +# #0 close (ex_srv.exec+0xa71c5) +# #1 boost::asio::detail::eventfd_select_interrupter::close_descriptors() +# .../include/boost/asio/detail/impl/eventfd_select_interrupter.ipp:106:5 (ex_srv.exec+0x1684e1) +# #2 boost::asio::detail::eventfd_select_interrupter::~eventfd_select_interrupter() +# .../include/boost/asio/detail/impl/eventfd_select_interrupter.ipp:98:3 (ex_srv.exec+0x1684e1) +# #3 boost::asio::detail::epoll_reactor::~epoll_reactor() +# .../include/boost/asio/detail/impl/epoll_reactor.ipp:72:1 (ex_srv.exec+0x1684e1) +# #4 boost::asio::detail::epoll_reactor::~epoll_reactor() +# .../include/boost/asio/detail/impl/epoll_reactor.ipp:67:1 (ex_srv.exec+0x168525) +# ... +# #16 ipc::transport::sync_io::Native_socket_stream::operator=(ipc::transport::sync_io::Native_socket_stream&&) +# /home/runner/work/ipc/ipc/ipc_core/src/ipc/transport/sync_io/native_socket_stream.cpp:36:77 (ex_srv.exec+0xd14d14) +# #17 ipc::session::Server_session_impl<...>::create_channel_and_resources(...)::'lambda'()::operator()() const +# /home/runner/work/ipc/ipc/ipc_session/src/ipc/session/detail/server_session_impl.hpp:1302:28 (ex_srv.exec+0x2ae130) +# ... +# Previous read of size 8 at 0x7ba8000006f0 by thread T2 (mutexes: write M0): +# #0 epoll_ctl (ex_srv.exec+0xa8086) +# #1 boost::asio::detail::epoll_reactor::deregister_descriptor(int, boost::asio::detail::epoll_reactor::descriptor_state*&, bool) +# .../include/boost/asio/detail/impl/epoll_reactor.ipp:403:7 (ex_srv.exec+0x18a89e) +# #2 boost::asio::detail::reactive_descriptor_service::release(boost::asio::detail::reactive_descriptor_service::implementation_type&) .../include/boost/asio/detail/impl/reactive_descriptor_service.ipp:174:14 (ex_srv.exec+0x18f816) +# #3 boost::asio::posix::basic_descriptor::release() +# .../include/boost/asio/posix/basic_descriptor.hpp:381:32 (ex_srv.exec+0x18f816) +# #4 ipc::util::sync_io::Asio_waitable_native_handle::~Asio_waitable_native_handle() +# /home/runner/work/ipc/ipc/ipc_core/src/ipc/util/sync_io/asio_waitable_native_hndl.cpp:53:3 (ex_srv.exec+0xd1f805) +# #5 ipc::transport::sync_io::Native_socket_stream::Impl::~Impl() +# /home/runner/work/ipc/ipc/ipc_core/src/ipc/transport/sync_io/detail/native_socket_stream_impl.cpp:197:1 (ex_srv.exec+0xd4e7f5) +# ... +# #34 ipc::transport::test::Ex_srv<...>::App_session::~App_session() +# /home/runner/work/ipc/ipc/test/suite/transport_test/ex_srv.hpp:64:9 (ex_srv.exec+0x26fcd2) +# ... +# Location is file descriptor 74 created by thread T79 at: +# #0 eventfd (ex_srv.exec+0xa66cc) +# #1 boost::asio::detail::eventfd_select_interrupter::open_descriptors() +# .../include/boost/asio/detail/impl/eventfd_select_interrupter.ipp:59:5 (ex_srv.exec+0x169545) +# ... +# #20 ipc::transport::sync_io::Native_socket_stream::Native_socket_stream() +# /home/runner/work/ipc/ipc/ipc_core/src/ipc/transport/sync_io/native_socket_stream.cpp:55:3 (ex_srv.exec+0xd14e3e) +# #21 ipc::session::Server_session_impl<...>::create_channel_and_resources(...) +# /home/runner/work/ipc/ipc/ipc_session/src/ipc/session/detail/server_session_impl.hpp:1275:24 (ex_srv.exec+0x2aba3e) +# ... +# Thread T79 '0x7b7c00012600]' (tid=75474, running) created by thread T5 at: +# ... +# Thread T2 'guy' (tid=75305, running) created by main thread at: +# ... +# SUMMARY: ThreadSanitizer: data race (/home/runner/bin/ex_srv.exec+0xa71c5) in close +# It is worried this FD is being close()d -- after *just* being created (synchronously even!) in that same thread, +# within Server_session_impl::create_channel_and_resources() -- which indeed happens: a blank Native_socket_stream, +# with an unused Task_engine (boost::asio::io_context), including its epoll FD, is quickly replaced by a real one +# storing a native handle from a ::socketpair() call; so the io_context+FD is created/opened and destroyed/closed. +# The thread is that session's Server_session_impl m_async_worker. It thinks that same FD was being messed with in a +# different thread -- namely 'guy' (from Ex_srv test class) -- when it was epoll_ctl()ed. But that was a different +# Server_session_impl which was just closed. It might be the same FD, numerically, but it was closed and then +# opened again; there is plenty of synchronization between those unrelated operations. Not sure why TSAN got confused; +# would need to understand its algorithm in more detail; but I (ygoldfel) know this particular code quite well; it +# doesn't make sense to me that this is unsynchronized access to the same FD. +# TODO: Try to reproduce and debug. Won't be easy; again I've only seen it once. Do note that the test completed +# fine even then (but that doesn't in itself prove there's no race of course). +# +# In the meantime suppress the issue at a few frames up from the reported problem. Hopefully that takes care of it. +race:boost::asio::detail::epoll_reactor::~epoll_reactor diff --git a/test/suite/transport_test/shm-c/sanitize/tsan/suppressions_clang.cfg b/test/suite/transport_test/shm-c/sanitize/tsan/suppressions_clang.cfg index 22b03ad14..b7a9d0541 100644 --- a/test/suite/transport_test/shm-c/sanitize/tsan/suppressions_clang.cfg +++ b/test/suite/transport_test/shm-c/sanitize/tsan/suppressions_clang.cfg @@ -1,3 +1,5 @@ +# Current version assumption: clang-15/16/17. + # This is for transport_test exercise mode SHM-classic sub-mode. Cf. exercise mode heap sub-mode: It passes cleanly # with ~no suppressions. Yet SHM-classic sub-mode spams quite a few TSAN warnings. Investigating them shows it is # *always* when reading or writing in SHM; and looking in more detail shows there is no real race; TSAN just cannot @@ -23,12 +25,12 @@ # we've specifically seen -- but not too much more. # offset_ptr used (in our context at least) only for SHM work, so catch anything in its code: will be false-positive. -race:boost::interprocess::offset_ptr<* +race:^boost::interprocess::offset_ptr< # Similarly anything capnp-generated or from libcapnp like the following would be in SHM. (Actually the # session master channel does non-SHM capnp, so there are some things in non-SHM; but those things are TSAN sans # suppression in heap sub-mode at least. So it's not so bad to include it in these suppressions.) Again catch # anything in there as false positive. -race:capnp::_::* +race:^capnp::_:: # There are common warnings from flow::util::Basic_blob>: in size(), in begin(), others. # In every case it's access to a SHM-oriented allocator-parameterized Basic_blob, which is a container type. @@ -36,11 +38,11 @@ race:capnp::_::* # string<>, list<>, etc. In all cases however they'll be shm::stl::Stateless_allocator-parameterized, and in all # cases of the latter it is access to SHM. So it should be ideal to just mention that allocator and wildcard the # rest. -race:*ipc::shm::stl::Stateless_allocator<* +race:ipc::shm::stl::Stateless_allocator< # This is a little too specific for my tastes -- but on the other hand just `boost::intrusive::*` might catch some # non-false-positive warning, theoretically, in non-SHM work. Until we see a false positive in boost::intrusive # that's not this, let's just suppress the exact false positive we have in fact seen. -race:boost::intrusive::detail::size_holder::get_size +race:^boost::intrusive::detail::size_holder::get_size$ # (End SHM-related suppressions.) diff --git a/test/suite/transport_test/shm-j/sanitize/tsan/suppressions_clang.cfg b/test/suite/transport_test/shm-j/sanitize/tsan/suppressions_clang.cfg deleted file mode 100644 index fb24c43f8..000000000 --- a/test/suite/transport_test/shm-j/sanitize/tsan/suppressions_clang.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# This is for transport_test exercise mode SHM-jemalloc sub-mode. -# In clang-15+ at least TSAN has a clean run except for warnings of the type: -# SUMMARY: ThreadSanitizer: data race .../src/include/jemalloc/internal/extent_inlines.h:285:17 in extent_... -# The extent_ function varies, but in all cases it's in a subtree of a `je_arena*()` call; this is inside -# jemalloc. I (ygoldfel) have filed a ticket regarding confirming this (as on the topic of SHM-jemalloc I am not -# the #1 subject matter expert), but IMO: this is very low-level code in the guts of jemalloc, and jemalloc is -# generally safe w/r/t concurrent calls; so most likely TSAN cannot follow what is going on. Also similar -# warnings are observed in unit_test (and a colleague suppressed je_arena* for it similarly). So: -# probably it is a false positive; but ticket has been filed to confirm; in the meantime confidence in TSAN -# coverage (with these suppressions) is high (on my part at least). -race:je_arena* diff --git a/test/suite/unit_test/sanitize/tsan/suppressions_clang.cfg b/test/suite/unit_test/sanitize/tsan/suppressions_clang.cfg index b2ba51ff9..37b2b8b1a 100644 --- a/test/suite/unit_test/sanitize/tsan/suppressions_clang.cfg +++ b/test/suite/unit_test/sanitize/tsan/suppressions_clang.cfg @@ -1,5 +1,8 @@ # Current version assumption: clang-15/16/17. +# Had some issues matching ^ and $ in this one; leaving them out; these are very unlikely to match something +# unintentional. + # TODO: Explain these in detail. race:ipc::shm::arena_lend::Borrower_shm_pool_collection::open_shm_pool race:ipc::session::shm::arena_lend::Shm_session_data::register_borrower_collection diff --git a/test/suite/unit_test/sanitize/tsan/suppressions_clang_15.cfg b/test/suite/unit_test/sanitize/tsan/suppressions_clang_15.cfg index 96f437bf8..f48067eaa 100644 --- a/test/suite/unit_test/sanitize/tsan/suppressions_clang_15.cfg +++ b/test/suite/unit_test/sanitize/tsan/suppressions_clang_15.cfg @@ -1,3 +1,8 @@ +# Current version assumption: clang-15/16/17. + +# Had some issues matching ^ and $ in this one; leaving them out; these are very unlikely to match something +# unintentional. + # TODO: Explain these in detail. race:ipc::session::shm::arena_lend::jemalloc::test::(anonymous namespace)::Test_client::open_app_channel race:ipc::session::shm::arena_lend::jemalloc::test::(anonymous namespace)::Test_client_manager::Client_data::~Client_data diff --git a/test/suite/unit_test/sanitize/tsan/suppressions_clang_16.cfg b/test/suite/unit_test/sanitize/tsan/suppressions_clang_16.cfg index 82ecc0ad8..d286c0166 100644 --- a/test/suite/unit_test/sanitize/tsan/suppressions_clang_16.cfg +++ b/test/suite/unit_test/sanitize/tsan/suppressions_clang_16.cfg @@ -1,3 +1,8 @@ +# Current version assumption: clang-15/16/17. + +# Had some issues matching ^ and $ in this one; leaving them out; these are very unlikely to match something +# unintentional. + # TODO: Explain these in detail. race:ipc::session::shm::arena_lend::jemalloc::test::(anonymous namespace)::Test_client::open_app_channel race:ipc::session::shm::arena_lend::jemalloc::test::(anonymous namespace)::Test_client_manager::~Test_client_manager diff --git a/test/suite/unit_test/sanitize/tsan/suppressions_clang_17.cfg b/test/suite/unit_test/sanitize/tsan/suppressions_clang_17.cfg index 4c4792430..2a4e5334d 100644 --- a/test/suite/unit_test/sanitize/tsan/suppressions_clang_17.cfg +++ b/test/suite/unit_test/sanitize/tsan/suppressions_clang_17.cfg @@ -1,3 +1,8 @@ +# Current version assumption: clang-15/16/17. + +# Had some issues matching ^ and $ in this one; leaving them out; these are very unlikely to match something +# unintentional. + # TODO: Explain these in detail. race:ipc::session::shm::arena_lend::jemalloc::test::(anonymous namespace)::Basic_event_listener::~Basic_event_listener race:ipc::session::shm::arena_lend::jemalloc::test::Test_shm_session_server_executor::many_objects_creator_functor