From d18420ba24652181a9eb963ea79475856ea333a6 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 14 Sep 2024 22:50:04 -0500 Subject: [PATCH 01/10] [ci] [python-package] temporarily stop testing against scikit-learn nightlies --- .ci/test-python-latest.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/.ci/test-python-latest.sh b/.ci/test-python-latest.sh index 08fc8558ef3e..04fbdddf573b 100755 --- a/.ci/test-python-latest.sh +++ b/.ci/test-python-latest.sh @@ -22,7 +22,6 @@ python -m pip install \ 'numpy>=2.0.0.dev0' \ 'matplotlib>=3.10.0.dev0' \ 'pandas>=3.0.0.dev0' \ - 'scikit-learn>=1.6.dev0' \ 'scipy>=1.15.0.dev0' python -m pip install \ From 024cfd9084d506dee1a7c812175bdc388b611de4 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 14 Sep 2024 23:16:00 -0500 Subject: [PATCH 02/10] still need scikit-learn... just a release --- .ci/test-python-latest.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.ci/test-python-latest.sh b/.ci/test-python-latest.sh index 04fbdddf573b..07c4fb116341 100755 --- a/.ci/test-python-latest.sh +++ b/.ci/test-python-latest.sh @@ -22,6 +22,7 @@ python -m pip install \ 'numpy>=2.0.0.dev0' \ 'matplotlib>=3.10.0.dev0' \ 'pandas>=3.0.0.dev0' \ + 'scikit-learn<1.16.0a0' \ 'scipy>=1.15.0.dev0' python -m pip install \ From fef917e2e77e811047409249c75a3233fde96c3a Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 14 Sep 2024 23:38:54 -0500 Subject: [PATCH 03/10] 1.6 not 1.16, ugh --- .ci/test-python-latest.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/test-python-latest.sh b/.ci/test-python-latest.sh index 07c4fb116341..0869f62ee705 100755 --- a/.ci/test-python-latest.sh +++ b/.ci/test-python-latest.sh @@ -22,7 +22,7 @@ python -m pip install \ 'numpy>=2.0.0.dev0' \ 'matplotlib>=3.10.0.dev0' \ 'pandas>=3.0.0.dev0' \ - 'scikit-learn<1.16.0a0' \ + 'scikit-learn<1.6.0a0' \ 'scipy>=1.15.0.dev0' python -m pip install \ From 4a0044992f4ab61e29a7f43c73c9dc78dc6b871f Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 15 Sep 2024 00:33:21 -0500 Subject: [PATCH 04/10] stricter constraint --- .ci/test-python-latest.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/test-python-latest.sh b/.ci/test-python-latest.sh index 0869f62ee705..f98f29f2641a 100755 --- a/.ci/test-python-latest.sh +++ b/.ci/test-python-latest.sh @@ -22,7 +22,7 @@ python -m pip install \ 'numpy>=2.0.0.dev0' \ 'matplotlib>=3.10.0.dev0' \ 'pandas>=3.0.0.dev0' \ - 'scikit-learn<1.6.0a0' \ + 'scikit-learn==1.5.*' \ 'scipy>=1.15.0.dev0' python -m pip install \ From 0229097143f71e5540c7d6f0008c101a31819bca Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 15 Sep 2024 21:54:06 -0500 Subject: [PATCH 05/10] try using pyarrow-core instead of pyarrow --- .ci/conda-envs/ci-core.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/conda-envs/ci-core.txt b/.ci/conda-envs/ci-core.txt index ef04e4df2b3a..a0763580c7f3 100644 --- a/.ci/conda-envs/ci-core.txt +++ b/.ci/conda-envs/ci-core.txt @@ -23,7 +23,7 @@ joblib>=1.3.2 matplotlib-base>=3.7.3 numpy>=1.24.4 pandas>2.0 -pyarrow>=6.0 +pyarrow-core>=6.0 python-graphviz>=0.20.3 scikit-learn>=1.3.2 scipy>=1.1 From 85a16e6cb06316a90daccc21915642cbfdf8ef45 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 20 Sep 2024 23:58:15 -0500 Subject: [PATCH 06/10] try a re-run From 7a9aa7e65dd5b1fcc88fddbd3930467c978ace77 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 21 Sep 2024 01:20:28 -0500 Subject: [PATCH 07/10] dlopen() libgomp as early as possible --- python-package/lightgbm/__init__.py | 45 +++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/python-package/lightgbm/__init__.py b/python-package/lightgbm/__init__.py index 600f71284159..ab45fd89cfc2 100644 --- a/python-package/lightgbm/__init__.py +++ b/python-package/lightgbm/__init__.py @@ -4,6 +4,51 @@ Contributors: https://github.com/microsoft/LightGBM/graphs/contributors. """ +import platform + +# gcc's libgomp tries to allocate a small amount of aligned static thread-local storage ("TLS") +# when it's dynamically loaded. +# +# If it's not able to find a block of aligned memory large enough, loading fails like this: +# +# > ../lib/libgomp.so.1: cannot allocate memory in static TLS block +# +# On aarch64 Linux, processes and loaded libraries share the same pool of static TLS, +# which makes such failures more likely on that architecture. +# (ref: https://bugzilla.redhat.com/show_bug.cgi?id=1722181#c6) +# +# Therefore, the later in a process libgomp.so is loaded, the greater the risk that loading +# it will fail in this way... so lightgbm tries to dlopen() it immediately, before any +# other imports or computation. +# +# This should generally be safe to do ... many other dynamically-loaded libraries have fallbacks +# that allow successful loading if there isn't sufficient static TLS available. +# +# libgomp.so absolutely needing it, by design, makes it a special case +# (ref: https://gcc.gcc.gnu.narkive.com/vOXMQqLA/failure-to-dlopen-libgomp-due-to-static-tls-data). +# +# other references: +# +# * https://github.com/microsoft/LightGBM/pull/6654#issuecomment-2352014275 +# * https://github.com/microsoft/LightGBM/issues/6509 +# * https://maskray.me/blog/2021-02-14-all-about-thread-local-storage +# * https://bugzilla.redhat.com/show_bug.cgi?id=1722181#c6 +# +if platform.system().lower() == "linux" and platform.processor().lower() == "aarch64": + import ctypes + + try: + # this seems specific to libgomp, so no need to attempt e.g. libomp or libiomp + _ = ctypes.CDLL("libgomp.so.1", ctypes.RTLD_GLOBAL) + except: # noqa: E722 + # this needs to be try-catched, to handle these situations: + # + # * LightGBM built without OpenMP (-DUSE_OPENMP=OFF) + # * non-gcc OpenMP used (e.g. clang/libomp, icc/libiomp) + # * no file "libgomp.so" available to the linker (e.g. maybe only "libgomp.so.1") + # + pass + from pathlib import Path from .basic import Booster, Dataset, Sequence, register_logger From c4841ae81e790cc5ea394379868a247848323395 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 21 Sep 2024 01:26:47 -0500 Subject: [PATCH 08/10] fix comment --- python-package/lightgbm/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/__init__.py b/python-package/lightgbm/__init__.py index ab45fd89cfc2..a107d3b9ece7 100644 --- a/python-package/lightgbm/__init__.py +++ b/python-package/lightgbm/__init__.py @@ -38,7 +38,7 @@ import ctypes try: - # this seems specific to libgomp, so no need to attempt e.g. libomp or libiomp + # this issue seems specific to libgomp, so no need to attempt e.g. libomp or libiomp _ = ctypes.CDLL("libgomp.so.1", ctypes.RTLD_GLOBAL) except: # noqa: E722 # this needs to be try-catched, to handle these situations: From 7a8c6d0b8fbbac1c4288c38c4502ade5319b9643 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 22 Sep 2024 01:38:41 -0500 Subject: [PATCH 09/10] load libgomp.so.1 earlier --- docs/FAQ.rst | 43 ++++++++++++++++++++++++++ python-package/lightgbm/__init__.py | 47 ++--------------------------- python-package/lightgbm/basic.py | 45 ++++++++++++--------------- python-package/lightgbm/libpath.py | 14 ++++++++- 4 files changed, 77 insertions(+), 72 deletions(-) diff --git a/docs/FAQ.rst b/docs/FAQ.rst index 3917b27a183a..2cec5fee06f6 100644 --- a/docs/FAQ.rst +++ b/docs/FAQ.rst @@ -206,6 +206,49 @@ Detailed description of conflicts between multiple OpenMP instances is provided If this is not your case, then you should find conflicting OpenMP library installations on your own and leave only one of them. +17. Loading LightGBM fails like: ``cannot allocate memory in static TLS block`` +------------------------------------------------------------------------------- + +When loading LightGBM, you may encounter errors like the following. + +.. code-block:: console + + lib/libgomp.so.1: cannot allocate memory in static TLS block + +This most commonly happens on aarch64 Linux systems. + +``gcc``'s OpenMP library (``libgomp.so``) tries to allocate a small amount of static thread-local storage ("TLS") +when it's dynamically loaded. + +That error can happen when the loader isn't able to find a large enough block of memory. + +On aarch64 Linux, processes and loaded libraries share the same pool of static TLS, +which makes such failures more likely. See these discussions: + +* https://bugzilla.redhat.com/show_bug.cgi?id=1722181#c6 +* https://gcc.gcc.gnu.narkive.com/vOXMQqLA/failure-to-dlopen-libgomp-due-to-static-tls-data + +If you are experiencing this issue when using the ``lightgbm`` Python package, try upgrading +to at least ``v4.6.0``. + +For older versions of the Python package, or for other LightGBM APIs, this issue can +often be avoided by loading ``libgomp.so.1``. That can be done directly by setting environment +variable ``LD_PRELOAD``, like this: + +.. code-block:: console + + export LD_PRELOAD=/root/miniconda3/envs/test-env/lib/libgomp.so.1 + +It can also be done indirectly by changing the order that other libraries are loaded +into processes, which varies by programming language and application type. + +For more details, see these discussions: + +* https://github.com/microsoft/LightGBM/pull/6654#issuecomment-2352014275 +* https://github.com/microsoft/LightGBM/issues/6509 +* https://maskray.me/blog/2021-02-14-all-about-thread-local-storage +* https://bugzilla.redhat.com/show_bug.cgi?id=1722181#c6 + ------ R-package diff --git a/python-package/lightgbm/__init__.py b/python-package/lightgbm/__init__.py index a107d3b9ece7..b679b3f665b5 100644 --- a/python-package/lightgbm/__init__.py +++ b/python-package/lightgbm/__init__.py @@ -4,53 +4,10 @@ Contributors: https://github.com/microsoft/LightGBM/graphs/contributors. """ -import platform - -# gcc's libgomp tries to allocate a small amount of aligned static thread-local storage ("TLS") -# when it's dynamically loaded. -# -# If it's not able to find a block of aligned memory large enough, loading fails like this: -# -# > ../lib/libgomp.so.1: cannot allocate memory in static TLS block -# -# On aarch64 Linux, processes and loaded libraries share the same pool of static TLS, -# which makes such failures more likely on that architecture. -# (ref: https://bugzilla.redhat.com/show_bug.cgi?id=1722181#c6) -# -# Therefore, the later in a process libgomp.so is loaded, the greater the risk that loading -# it will fail in this way... so lightgbm tries to dlopen() it immediately, before any -# other imports or computation. -# -# This should generally be safe to do ... many other dynamically-loaded libraries have fallbacks -# that allow successful loading if there isn't sufficient static TLS available. -# -# libgomp.so absolutely needing it, by design, makes it a special case -# (ref: https://gcc.gcc.gnu.narkive.com/vOXMQqLA/failure-to-dlopen-libgomp-due-to-static-tls-data). -# -# other references: -# -# * https://github.com/microsoft/LightGBM/pull/6654#issuecomment-2352014275 -# * https://github.com/microsoft/LightGBM/issues/6509 -# * https://maskray.me/blog/2021-02-14-all-about-thread-local-storage -# * https://bugzilla.redhat.com/show_bug.cgi?id=1722181#c6 -# -if platform.system().lower() == "linux" and platform.processor().lower() == "aarch64": - import ctypes - - try: - # this issue seems specific to libgomp, so no need to attempt e.g. libomp or libiomp - _ = ctypes.CDLL("libgomp.so.1", ctypes.RTLD_GLOBAL) - except: # noqa: E722 - # this needs to be try-catched, to handle these situations: - # - # * LightGBM built without OpenMP (-DUSE_OPENMP=OFF) - # * non-gcc OpenMP used (e.g. clang/libomp, icc/libiomp) - # * no file "libgomp.so" available to the linker (e.g. maybe only "libgomp.so.1") - # - pass - from pathlib import Path +# .basic is intentionally loaded as early as possible, to dlopen() lib_lightgbm.{dll,dylib,so} +# and its dependencies as early as possible from .basic import Booster, Dataset, Sequence, register_logger from .callback import EarlyStopException, early_stopping, log_evaluation, record_evaluation, reset_parameter from .engine import CVBooster, cv, train diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index af4d757f480b..9da553b1a64a 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -1,6 +1,13 @@ # coding: utf-8 """Wrapper for C API of LightGBM.""" +# This import causes lib_lightgbm.{dll,dylib,so} to be loaded. +# It's intentionally done here, as early as possible, to avoid issues like +# "libgomp.so.1: cannot allocate memory in static TLS block" on aarch64 Linux. +# +# For details, see the "cannot allocate memory in static TLS block" entry in docs/FAQ.rst. +from .libpath import _LIB # isort: skip + import abc import ctypes import inspect @@ -37,7 +44,6 @@ pd_DataFrame, pd_Series, ) -from .libpath import find_lib_path if TYPE_CHECKING: from typing import Literal @@ -160,6 +166,12 @@ _MULTICLASS_OBJECTIVES = {"multiclass", "multiclassova", "multiclass_ova", "ova", "ovr", "softmax"} +class LightGBMError(Exception): + """Error thrown by LightGBM.""" + + pass + + def _is_zero(x: float) -> bool: return -ZERO_THRESHOLD <= x <= ZERO_THRESHOLD @@ -259,26 +271,13 @@ def _log_callback(msg: bytes) -> None: _log_native(str(msg.decode("utf-8"))) -def _load_lib() -> ctypes.CDLL: - """Load LightGBM library.""" - lib_path = find_lib_path() - lib = ctypes.cdll.LoadLibrary(lib_path[0]) - lib.LGBM_GetLastError.restype = ctypes.c_char_p - callback = ctypes.CFUNCTYPE(None, ctypes.c_char_p) - lib.callback = callback(_log_callback) # type: ignore[attr-defined] - if lib.LGBM_RegisterLogCallback(lib.callback) != 0: - raise LightGBMError(lib.LGBM_GetLastError().decode("utf-8")) - return lib - - -# we don't need lib_lightgbm while building docs -_LIB: ctypes.CDLL +# connect the Python logger to logging in lib_lightgbm if environ.get("LIGHTGBM_BUILD_DOC", False): - from unittest.mock import Mock # isort: skip - - _LIB = Mock(ctypes.CDLL) # type: ignore -else: - _LIB = _load_lib() + _LIB.LGBM_GetLastError.restype = ctypes.c_char_p + callback = ctypes.CFUNCTYPE(None, ctypes.c_char_p) + _LIB.callback = callback(_log_callback) # type: ignore[attr-defined] + if _LIB.LGBM_RegisterLogCallback(_LIB.callback) != 0: + raise LightGBMError(_LIB.LGBM_GetLastError().decode("utf-8")) _NUMERIC_TYPES = (int, float, bool) @@ -552,12 +551,6 @@ def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: self.path.unlink() -class LightGBMError(Exception): - """Error thrown by LightGBM.""" - - pass - - # DeprecationWarning is not shown by default, so let's create our own with higher level # ref: https://peps.python.org/pep-0565/#additional-use-case-for-futurewarning class LGBMDeprecationWarning(FutureWarning): diff --git a/python-package/lightgbm/libpath.py b/python-package/lightgbm/libpath.py index a55e7362ab44..0e6b8425dccb 100644 --- a/python-package/lightgbm/libpath.py +++ b/python-package/lightgbm/libpath.py @@ -1,6 +1,8 @@ # coding: utf-8 """Find the path to LightGBM dynamic library files.""" +import ctypes +from os import environ from pathlib import Path from platform import system from typing import List @@ -8,7 +10,7 @@ __all__: List[str] = [] -def find_lib_path() -> List[str]: +def _find_lib_path() -> List[str]: """Find the path to LightGBM library files. Returns @@ -35,3 +37,13 @@ def find_lib_path() -> List[str]: dll_path_joined = "\n".join(map(str, dll_path)) raise Exception(f"Cannot find lightgbm library file in following paths:\n{dll_path_joined}") return lib_path + + +# we don't need lib_lightgbm while building docs +_LIB: ctypes.CDLL +if environ.get("LIGHTGBM_BUILD_DOC", False): + from unittest.mock import Mock # isort: skip + + _LIB = Mock(ctypes.CDLL) # type: ignore +else: + _LIB = ctypes.cdll.LoadLibrary(_find_lib_path()[0]) From 48b8f461dc76afccefbb191e36bd2b74b05fc95c Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 22 Sep 2024 01:47:01 -0500 Subject: [PATCH 10/10] fix docs condition --- python-package/lightgbm/basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 9da553b1a64a..73c040b7da4e 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -272,7 +272,7 @@ def _log_callback(msg: bytes) -> None: # connect the Python logger to logging in lib_lightgbm -if environ.get("LIGHTGBM_BUILD_DOC", False): +if not environ.get("LIGHTGBM_BUILD_DOC", False): _LIB.LGBM_GetLastError.restype = ctypes.c_char_p callback = ctypes.CFUNCTYPE(None, ctypes.c_char_p) _LIB.callback = callback(_log_callback) # type: ignore[attr-defined]