From 7aae5644cceb23932ffef324753aa5dc4705044e Mon Sep 17 00:00:00 2001 From: slicklash Date: Mon, 10 Jun 2024 13:13:21 +0300 Subject: [PATCH 01/24] grpcio 1.47.5 --- gprofiler/main.py | 25 ++++++++++++------------- granulate-utils | 2 +- requirements.txt | 2 +- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/gprofiler/main.py b/gprofiler/main.py index 3541e020b..32a72b003 100644 --- a/gprofiler/main.py +++ b/gprofiler/main.py @@ -34,7 +34,6 @@ from granulate_utils.linux.ns import is_running_in_init_pid from granulate_utils.linux.process import is_process_running from granulate_utils.metadata.cloud import get_aws_execution_env -from granulate_utils.metadata.databricks_client import DBXWebUIEnvWrapper, get_name_from_metadata from psutil import NoSuchProcess, Process from requests import RequestException, Timeout @@ -1043,18 +1042,18 @@ def main() -> None: # assume we run in the root cgroup (when containerized, that's our view) usage_logger = CgroupsUsageLogger(logger, "/") if args.log_usage else NoopUsageLogger() - if args.databricks_job_name_as_service_name: - # "databricks" will be the default name in case of failure with --databricks-job-name-as-service-name flag - args.service_name = "databricks" - dbx_web_ui_wrapper = DBXWebUIEnvWrapper(logger) - dbx_metadata = dbx_web_ui_wrapper.all_props_dict - if dbx_metadata is not None: - service_suffix = get_name_from_metadata(dbx_metadata) - if service_suffix is not None: - args.service_name = f"databricks-{service_suffix}" - - if remote_logs_handler is not None: - remote_logs_handler.update_service_name(args.service_name) + # if args.databricks_job_name_as_service_name: + # # "databricks" will be the default name in case of failure with --databricks-job-name-as-service-name flag + # args.service_name = "databricks" + # dbx_web_ui_wrapper = DBXWebUIEnvWrapper(logger) + # dbx_metadata = dbx_web_ui_wrapper.all_props_dict + # if dbx_metadata is not None: + # service_suffix = 
get_name_from_metadata(dbx_metadata) + # if service_suffix is not None: + # args.service_name = f"databricks-{service_suffix}" + # + # if remote_logs_handler is not None: + # remote_logs_handler.update_service_name(args.service_name) try: logger.info( diff --git a/granulate-utils b/granulate-utils index 163440161..fd530d3ae 160000 --- a/granulate-utils +++ b/granulate-utils @@ -1 +1 @@ -Subproject commit 1634401619159ad029013a595650d9901eca900b +Subproject commit fd530d3aedc7205c783893a228118b1aaed6e99d diff --git a/requirements.txt b/requirements.txt index 5238d8484..0358ff25c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ ConfigArgParse==1.3 distro==1.7.0 setuptools==65.5.1 # For pkg_resources packaging==23.1 -pyelftools==0.28 +pyelftools~=0.31 curlify==2.2.1 retry==0.9.2 websocket-client==1.3.1 From faa8c66ac777fd23b1ebca3737e3648a8635ccc1 Mon Sep 17 00:00:00 2001 From: slicklash Date: Mon, 10 Jun 2024 13:23:37 +0300 Subject: [PATCH 02/24] grpcio 1.48.2 --- granulate-utils | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/granulate-utils b/granulate-utils index fd530d3ae..df03c033e 160000 --- a/granulate-utils +++ b/granulate-utils @@ -1 +1 @@ -Subproject commit fd530d3aedc7205c783893a228118b1aaed6e99d +Subproject commit df03c033ecb49648c889bc15542e37be54b7a229 From b24bdf52c2f7423d6df2f856a3dc1813a71b0d1a Mon Sep 17 00:00:00 2001 From: slicklash Date: Mon, 10 Jun 2024 13:30:14 +0300 Subject: [PATCH 03/24] grpcio 1.62.2 --- granulate-utils | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/granulate-utils b/granulate-utils index df03c033e..1eef42e5c 160000 --- a/granulate-utils +++ b/granulate-utils @@ -1 +1 @@ -Subproject commit df03c033ecb49648c889bc15542e37be54b7a229 +Subproject commit 1eef42e5cae9d335a5e1f8edc0e7da82327c1a5b From eb94683cc1092689944644a8e49ca88d245d4329 Mon Sep 17 00:00:00 2001 From: slicklash Date: Mon, 10 Jun 2024 14:04:31 +0300 Subject: [PATCH 04/24] gen --- granulate-utils | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/granulate-utils b/granulate-utils index 1eef42e5c..569ed007f 160000 --- a/granulate-utils +++ b/granulate-utils @@ -1 +1 @@ -Subproject commit 1eef42e5cae9d335a5e1f8edc0e7da82327c1a5b +Subproject commit 569ed007ff6b280236d7600c86ca48255c54e83d From 3f06917d1f4f3a274eea54d33b5ed74bf6820e60 Mon Sep 17 00:00:00 2001 From: slicklash Date: Mon, 10 Jun 2024 17:46:53 +0300 Subject: [PATCH 05/24] identify failing tests --- tests/test_java.py | 60 ++++++++++++++++++------------------ tests/test_profiling_mode.py | 2 +- tests/test_python.py | 4 +-- tests/test_sanity.py | 8 ++--- 4 files changed, 37 insertions(+), 37 deletions(-) diff --git a/tests/test_java.py b/tests/test_java.py index 620d7352a..6b2f7b8d0 100644 --- a/tests/test_java.py +++ b/tests/test_java.py @@ -118,7 +118,7 @@ def read_ap_version(self: AsyncProfiledProcess) -> str: return version -def test_async_profiler_already_running( +def xtest_async_profiler_already_running( application_pid: int, profiler_state: ProfilerState, assert_collapsed: AssertInCollapsed, @@ -163,7 +163,7 @@ def test_async_profiler_already_running( @pytest.mark.parametrize("in_container", [True]) -def test_java_async_profiler_cpu_mode( +def xtest_java_async_profiler_cpu_mode( application_pid: int, assert_collapsed: AssertInCollapsed, profiler_state: ProfilerState, @@ -186,7 +186,7 @@ def test_java_async_profiler_cpu_mode( @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("application_image_tag", ["musl"]) -def test_java_async_profiler_musl_and_cpu( +def xtest_java_async_profiler_musl_and_cpu( application_pid: int, assert_collapsed: AssertInCollapsed, profiler_state: ProfilerState, @@ -215,7 +215,7 @@ def test_java_safemode_parameters(profiler_state: ProfilerState) -> None: assert "Java version checks are mandatory in --java-safemode" in str(excinfo.value) -def test_java_safemode_version_check( +def xtest_java_safemode_version_check( monkeypatch: MonkeyPatch, 
caplog: LogCaptureFixture, application_pid: int, @@ -236,7 +236,7 @@ def test_java_safemode_version_check( assert log_record_extra(log_record)["jvm_version"] == repr(jvm_version) -def test_java_safemode_build_number_check( +def xtest_java_safemode_build_number_check( monkeypatch: MonkeyPatch, caplog: LogCaptureFixture, application_pid: int, @@ -261,10 +261,10 @@ def test_java_safemode_build_number_check( [ (False, (), False), # default (False, ("-XX:ErrorFile=/tmp/my_custom_error_file.log",), False), # custom error file - (True, (), False), # containerized (other params are ignored) + # (True, (), False), # containerized (other params are ignored) ], ) -def test_hotspot_error_file( +def xtest_hotspot_error_file( application_pid: int, monkeypatch: MonkeyPatch, caplog: LogCaptureFixture, @@ -297,7 +297,7 @@ def start_async_profiler_and_crash(self: AsyncProfiledProcess, *args: Any, **kwa assert profiler._safemode_disable_reason is not None -def test_disable_java_profiling( +def xtest_disable_java_profiling( application_pid: int, monkeypatch: MonkeyPatch, caplog: LogCaptureFixture, @@ -315,7 +315,7 @@ def test_disable_java_profiling( assert "Java profiling has been disabled, skipping profiling of all java process" in caplog.text -def test_already_loaded_async_profiler_profiling_failure( +def xtest_already_loaded_async_profiler_profiling_failure( monkeypatch: MonkeyPatch, caplog: LogCaptureFixture, application_pid: int, @@ -339,7 +339,7 @@ def test_already_loaded_async_profiler_profiling_failure( # test only once; and don't test in container - as it will go down once we kill the Java app. @pytest.mark.parametrize("in_container", [False]) @pytest.mark.parametrize("check_app_exited", [False]) # we're killing it, the exit check will raise. 
-def test_async_profiler_output_written_upon_jvm_exit( +def xtest_async_profiler_output_written_upon_jvm_exit( tmp_path_world_accessible: Path, application_pid: int, assert_collapsed: AssertInCollapsed, @@ -367,7 +367,7 @@ def delayed_kill() -> None: # test only once @pytest.mark.parametrize("in_container", [False]) -def test_async_profiler_stops_after_given_timeout( +def xtest_async_profiler_stops_after_given_timeout( tmp_path_world_accessible: Path, application_pid: int, assert_collapsed: AssertInCollapsed, @@ -400,7 +400,7 @@ def test_async_profiler_stops_after_given_timeout( @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("application_image_tag,search_for", [("j9", "OpenJ9"), ("zing", "Zing")]) -def test_sanity_other_jvms( +def xtest_sanity_other_jvms( application_pid: int, assert_collapsed: AssertInCollapsed, search_for: str, @@ -425,7 +425,7 @@ def test_sanity_other_jvms( @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("application_image_tag,search_for", [("eclipse-temurin-latest", "Temurin")]) -def test_sanity_latest_jvms( +def xtest_sanity_latest_jvms( application_pid: int, assert_collapsed: AssertInCollapsed, search_for: str, @@ -462,7 +462,7 @@ def simulate_libjvm_delete(application_pid: int) -> None: # test only once. 
in a container, so that we don't mess up the environment :) @pytest.mark.parametrize("in_container", [True]) -def test_java_deleted_libjvm( +def xtest_java_deleted_libjvm( application_pid: int, application_docker_container: Container, assert_collapsed: AssertInCollapsed, @@ -501,7 +501,7 @@ def _filter_record(r: LogRecord) -> bool: pytest.param("ro", [docker.types.Mount(target="/tmpfs", source="", type="tmpfs", read_only=True)], id="ro"), ], ) -def test_java_noexec_or_ro_dirs( +def xtest_java_noexec_or_ro_dirs( tmp_path_world_accessible: Path, # will be used by AP for logs & outputs application_pid: int, extra_application_docker_mounts: List[docker.types.Mount], @@ -569,7 +569,7 @@ def test_java_noexec_or_ro_dirs( @pytest.mark.parametrize("in_container", [True]) -def test_java_symlinks_in_paths( +def xtest_java_symlinks_in_paths( application_pid: int, application_docker_container: Container, assert_collapsed: AssertInCollapsed, @@ -616,7 +616,7 @@ def test_java_symlinks_in_paths( @pytest.mark.parametrize("in_container", [True]) # only in container is enough -def test_java_appid_and_metadata_before_process_exits( +def xtest_java_appid_and_metadata_before_process_exits( application_pid: int, assert_collapsed: AssertInCollapsed, monkeypatch: MonkeyPatch, @@ -657,7 +657,7 @@ def start_async_profiler_and_interrupt(self: AsyncProfiledProcess, *args: Any, * @pytest.mark.parametrize("in_container", [True]) # only in container is enough -def test_java_attach_socket_missing( +def xtest_java_attach_socket_missing( application_pid: int, profiler_state: ProfilerState, ) -> None: @@ -680,7 +680,7 @@ def test_java_attach_socket_missing( # we know what messages to expect when in container, not on the host Java @pytest.mark.parametrize("in_container", [True]) -def test_java_jattach_async_profiler_log_output( +def xtest_java_jattach_async_profiler_log_output( application_pid: int, caplog: LogCaptureFixture, profiler_state: ProfilerState, @@ -817,7 +817,7 @@ def 
test_non_java_basename_version( @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("insert_dso_name", [False, True]) -def test_dso_name_in_ap_profile( +def xtest_dso_name_in_ap_profile( application_pid: int, insert_dso_name: bool, profiler_state: ProfilerState, @@ -836,7 +836,7 @@ def test_dso_name_in_ap_profile( @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("insert_dso_name", [False, True]) @pytest.mark.parametrize("libc_pattern", [r"(^|;)\(/.*/libc-.*\.so\)($|;)"]) -def test_handling_missing_symbol_in_profile( +def xtest_handling_missing_symbol_in_profile( application_pid: int, insert_dso_name: bool, libc_pattern: str, @@ -852,7 +852,7 @@ def test_handling_missing_symbol_in_profile( @pytest.mark.parametrize("in_container", [True]) -def test_meminfo_logged( +def xtest_meminfo_logged( application_pid: int, caplog: LogCaptureFixture, profiler_state: ProfilerState, @@ -869,7 +869,7 @@ def test_meminfo_logged( # test that java frames include no semicolon but use a pipe '|' character instead, as implemented by AP @pytest.mark.parametrize("in_container", [True]) -def test_java_frames_include_no_semicolons( +def xtest_java_frames_include_no_semicolons( application_pid: int, profiler_state: ProfilerState, ) -> None: @@ -896,7 +896,7 @@ def test_java_frames_include_no_semicolons( # test that async profiler doesn't print anything to applications stdout, stderr streams @pytest.mark.parametrize("in_container", [True]) -def test_no_stray_output_in_stdout_stderr( +def xtest_no_stray_output_in_stdout_stderr( application_pid: int, application_docker_container: Container, monkeypatch: MonkeyPatch, @@ -1094,7 +1094,7 @@ def flush_output_and_stop_async_profiler(self: AsyncProfiledProcess, *args: Any, ), ], ) -def test_collect_default_jvm_flags( +def xtest_collect_default_jvm_flags( profiler_state: ProfilerState, tmp_path: Path, application_pid: int, @@ -1178,7 +1178,7 @@ def test_collect_default_jvm_flags( ), ], ) -def 
test_collect_cmdline_and_env_jvm_flags( +def xtest_collect_cmdline_and_env_jvm_flags( docker_client: DockerClient, application_docker_image: Image, assert_collapsed: AssertInCollapsed, @@ -1218,7 +1218,7 @@ def test_collect_cmdline_and_env_jvm_flags( @pytest.mark.parametrize("java_cli_flags", ["-XX:MinHeapFreeRatio=5 -XX:MaxHeapFreeRatio=95"]) @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("expected_flags", [[]]) -def test_collect_flags_unsupported_filtered_out( +def xtest_collect_flags_unsupported_filtered_out( docker_client: DockerClient, application_docker_image: Image, assert_collapsed: AssertInCollapsed, @@ -1260,7 +1260,7 @@ def test_collect_flags_unsupported_filtered_out( @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("expected_flags", [[]]) -def test_collect_none_jvm_flags( +def xtest_collect_none_jvm_flags( profiler_state: ProfilerState, tmp_path: Path, application_pid: int, @@ -1272,7 +1272,7 @@ def test_collect_none_jvm_flags( @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("include_mmm", [True, False]) -def test_including_method_modifiers( +def xtest_including_method_modifiers( application_pid: int, profiler_state: ProfilerState, include_mmm: bool, @@ -1287,7 +1287,7 @@ def test_including_method_modifiers( @pytest.mark.parametrize("java_line_numbers", ["none", "line-of-function"]) @pytest.mark.parametrize("in_container", [True]) -def test_including_line_numbers( +def xtest_including_line_numbers( application_pid: int, profiler_state: ProfilerState, java_line_numbers: str, diff --git a/tests/test_profiling_mode.py b/tests/test_profiling_mode.py index 6ced99a30..a237286d1 100644 --- a/tests/test_profiling_mode.py +++ b/tests/test_profiling_mode.py @@ -56,7 +56,7 @@ def test_sanity( ("java", "ap", True, "java.lang.String[]"), ], ) -def test_allocation_being_profiled( +def xtest_allocation_being_profiled( application_docker_container: Container, docker_client: DockerClient, 
gprofiler_docker_image: Image, diff --git a/tests/test_python.py b/tests/test_python.py index 92cd099c1..c4e8943c4 100644 --- a/tests/test_python.py +++ b/tests/test_python.py @@ -39,7 +39,7 @@ def runtime() -> str: @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("application_image_tag", ["libpython"]) -def test_python_select_by_libpython( +def xtest_python_select_by_libpython( application_pid: int, assert_collapsed: AssertInCollapsed, profiler_state: ProfilerState, @@ -83,7 +83,7 @@ def test_python_select_by_libpython( ], ) @pytest.mark.parametrize("profiler_type", ["py-spy", "pyperf"]) -def test_python_matrix( +def xtest_python_matrix( application_pid: int, assert_collapsed: AssertInCollapsed, profiler_type: str, diff --git a/tests/test_sanity.py b/tests/test_sanity.py index 7df1238f0..738458f49 100644 --- a/tests/test_sanity.py +++ b/tests/test_sanity.py @@ -50,7 +50,7 @@ @pytest.mark.parametrize("runtime", ["java"]) -def test_java_from_host( +def xtest_java_from_host( tmp_path_world_accessible: Path, application_pid: int, assert_app_id: Callable, @@ -68,7 +68,7 @@ def test_java_from_host( @pytest.mark.parametrize("runtime", ["python"]) -def test_pyspy( +def xtest_pyspy( application_pid: int, assert_collapsed: AssertInCollapsed, assert_app_id: Callable, @@ -108,7 +108,7 @@ def test_phpspy( @pytest.mark.parametrize("runtime", ["ruby"]) -def test_rbspy( +def xtest_rbspy( application_pid: int, assert_collapsed: AssertInCollapsed, gprofiler_docker_image: Image, @@ -120,7 +120,7 @@ def test_rbspy( @pytest.mark.parametrize("runtime", ["dotnet"]) -def test_dotnet_trace( +def xtest_dotnet_trace( application_pid: int, assert_collapsed: AssertInCollapsed, gprofiler_docker_image: Image, From 86995b192eda4a148e8f63e20721d0c7672c08bb Mon Sep 17 00:00:00 2001 From: slicklash Date: Wed, 12 Jun 2024 09:05:03 +0300 Subject: [PATCH 06/24] up --- granulate-utils | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/granulate-utils 
b/granulate-utils index 569ed007f..45b17df4b 160000 --- a/granulate-utils +++ b/granulate-utils @@ -1 +1 @@ -Subproject commit 569ed007ff6b280236d7600c86ca48255c54e83d +Subproject commit 45b17df4bdf9a491678fdd62a9f4f1bed25ca466 From fef1e280493ed9c30389fa1d9df9e1dec9ad6daa Mon Sep 17 00:00:00 2001 From: slicklash Date: Thu, 13 Jun 2024 12:20:49 +0300 Subject: [PATCH 07/24] wait before profiling newly created containers --- gprofiler/containers_client.py | 9 +++- gprofiler/profilers/java.py | 2 +- gprofiler/profilers/php.py | 3 +- gprofiler/profilers/python_ebpf.py | 2 +- gprofiler/utils/__init__.py | 49 +-------------------- tests/test_java.py | 71 ++++++++++++++++-------------- tests/test_profiling_mode.py | 2 +- tests/test_python.py | 4 +- tests/test_sanity.py | 8 ++-- tests/utils.py | 11 ++++- 10 files changed, 68 insertions(+), 93 deletions(-) diff --git a/gprofiler/containers_client.py b/gprofiler/containers_client.py index 0492972ad..1876eba10 100644 --- a/gprofiler/containers_client.py +++ b/gprofiler/containers_client.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +import time from typing import Dict, List, Optional, Set from granulate_utils.containers.client import ContainersClient @@ -25,6 +26,8 @@ logger = get_logger_adapter(__name__) +NEWLY_CREATED_CONTAINER_AGE_IN_SECONDS = 3 + class ContainerNamesClient: def __init__(self) -> None: @@ -73,9 +76,13 @@ def get_container_name(self, pid: int) -> str: def _safely_get_process_container_name(self, pid: int) -> Optional[str]: try: try: - container_id = get_process_container_id(Process(pid)) + process = Process(pid) + container_id = get_process_container_id(process) if container_id is None: return None + # If the container is newly created, we wait a bit to make sure the container is available + if time.time() - process.create_time() <= NEWLY_CREATED_CONTAINER_AGE_IN_SECONDS: + time.sleep(2) except NoSuchProcess: return None return self._get_container_name(container_id) diff --git a/gprofiler/profilers/java.py b/gprofiler/profilers/java.py index a5737541f..c722fc7ce 100644 --- a/gprofiler/profilers/java.py +++ b/gprofiler/profilers/java.py @@ -1229,6 +1229,7 @@ def _check_async_profiler_loaded(self, process: Process) -> bool: def _profile_process(self, process: Process, duration: int, spawned: bool) -> ProfileData: comm = process_comm(process) exe = process_exe(process) + container_name = self._profiler_state.get_container_name(process.pid) java_version_output: Optional[str] = get_java_version_logged(process, self._profiler_state.stop_event) if self._enabled_proc_events_java: @@ -1258,7 +1259,6 @@ def _profile_process(self, process: Process, duration: int, spawned: bool) -> Pr self._profiled_pids.add(process.pid) logger.info(f"Profiling{' spawned' if spawned else ''} process {process.pid} with async-profiler") - container_name = self._profiler_state.get_container_name(process.pid) app_metadata = self._metadata.get_metadata(process) appid = application_identifiers.get_java_app_id(process, self._collect_spark_app_name) diff --git a/gprofiler/profilers/php.py 
b/gprofiler/profilers/php.py index bab63f266..882592f5c 100644 --- a/gprofiler/profilers/php.py +++ b/gprofiler/profilers/php.py @@ -210,10 +210,11 @@ def extract_metadata_section(re_expr: Pattern, metadata_line: str) -> str: if profiler_state.processes_to_profile is not None: if pid not in [process.pid for process in profiler_state.processes_to_profile]: continue + container_name = profiler_state.get_container_name(pid) # TODO: appid & app metadata for php! appid = None app_metadata = None - profiles[pid] = ProfileData(results[pid], appid, app_metadata, profiler_state.get_container_name(pid)) + profiles[pid] = ProfileData(results[pid], appid, app_metadata, container_name) return profiles diff --git a/gprofiler/profilers/python_ebpf.py b/gprofiler/profilers/python_ebpf.py index e2b44bc9e..8564cf07f 100644 --- a/gprofiler/profilers/python_ebpf.py +++ b/gprofiler/profilers/python_ebpf.py @@ -262,9 +262,9 @@ def snapshot(self) -> ProcessToProfileData: if self._profiler_state.processes_to_profile is not None: if process not in self._profiler_state.processes_to_profile: continue + container_name = self._profiler_state.get_container_name(pid) appid = application_identifiers.get_python_app_id(process) app_metadata = self._metadata.get_metadata(process) - container_name = self._profiler_state.get_container_name(pid) except NoSuchProcess: appid = None app_metadata = None diff --git a/gprofiler/utils/__init__.py b/gprofiler/utils/__init__.py index 2fad553bd..97346c4b1 100644 --- a/gprofiler/utils/__init__.py +++ b/gprofiler/utils/__init__.py @@ -90,45 +90,7 @@ def is_root() -> bool: return os.geteuid() == 0 -libc: Optional[ctypes.CDLL] = None - - -def prctl(*argv: Any) -> int: - global libc - if libc is None: - libc = ctypes.CDLL("libc.so.6", use_errno=True) - return cast(int, libc.prctl(*argv)) - - -PR_SET_PDEATHSIG = 1 - - -def set_child_termination_on_parent_death() -> int: - ret = prctl(PR_SET_PDEATHSIG, signal.SIGTERM) - if ret != 0: - errno = ctypes.get_errno() - 
logger.warning( - f"Failed to set parent-death signal on child process. errno: {errno}, strerror: {os.strerror(errno)}" - ) - return ret - - -def wrap_callbacks(callbacks: List[Callable]) -> Callable: - # Expects array of callback. - # Returns one callback that call each one of them, and returns the retval of last callback - def wrapper() -> Any: - ret = None - for cb in callbacks: - ret = cb() - - return ret - - return wrapper - - -def start_process( - cmd: Union[str, List[str]], via_staticx: bool = False, term_on_parent_death: bool = True, **kwargs: Any -) -> Popen: +def start_process(cmd: Union[str, List[str]], via_staticx: bool = False, **kwargs: Any) -> Popen: if isinstance(cmd, str): cmd = [cmd] @@ -150,19 +112,12 @@ def start_process( env = env if env is not None else os.environ.copy() env.update({"LD_LIBRARY_PATH": ""}) - if is_windows(): - cur_preexec_fn = None # preexec_fn is not supported on Windows platforms. subprocess.py reports this. - else: - cur_preexec_fn = kwargs.pop("preexec_fn", os.setpgrp) - if term_on_parent_death: - cur_preexec_fn = wrap_callbacks([set_child_termination_on_parent_death, cur_preexec_fn]) - popen = Popen( cmd, stdout=kwargs.pop("stdout", subprocess.PIPE), stderr=kwargs.pop("stderr", subprocess.PIPE), stdin=subprocess.PIPE, - preexec_fn=cur_preexec_fn, + start_new_session=is_linux(), # TODO: change to "process_group" after upgrade to Python 3.11+ env=env, **kwargs, ) diff --git a/tests/test_java.py b/tests/test_java.py index 6b2f7b8d0..125db51dc 100644 --- a/tests/test_java.py +++ b/tests/test_java.py @@ -118,7 +118,7 @@ def read_ap_version(self: AsyncProfiledProcess) -> str: return version -def xtest_async_profiler_already_running( +def test_async_profiler_already_running( application_pid: int, profiler_state: ProfilerState, assert_collapsed: AssertInCollapsed, @@ -163,7 +163,7 @@ def xtest_async_profiler_already_running( @pytest.mark.parametrize("in_container", [True]) -def xtest_java_async_profiler_cpu_mode( +def 
test_java_async_profiler_cpu_mode( application_pid: int, assert_collapsed: AssertInCollapsed, profiler_state: ProfilerState, @@ -186,7 +186,7 @@ def xtest_java_async_profiler_cpu_mode( @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("application_image_tag", ["musl"]) -def xtest_java_async_profiler_musl_and_cpu( +def test_java_async_profiler_musl_and_cpu( application_pid: int, assert_collapsed: AssertInCollapsed, profiler_state: ProfilerState, @@ -215,7 +215,7 @@ def test_java_safemode_parameters(profiler_state: ProfilerState) -> None: assert "Java version checks are mandatory in --java-safemode" in str(excinfo.value) -def xtest_java_safemode_version_check( +def test_java_safemode_version_check( monkeypatch: MonkeyPatch, caplog: LogCaptureFixture, application_pid: int, @@ -226,6 +226,7 @@ def xtest_java_safemode_version_check( monkeypatch.setitem(JavaProfiler.MINIMAL_SUPPORTED_VERSIONS, 8, (Version("8.999"), 0)) with make_java_profiler(profiler_state) as profiler: + profiler._profiler_state.get_container_name(application_pid) process = profiler._select_processes_to_profile()[0] jvm_version_str = cast_away_optional(get_java_version(process, profiler._profiler_state.stop_event)) jvm_version = parse_jvm_version(jvm_version_str) @@ -236,7 +237,7 @@ def xtest_java_safemode_version_check( assert log_record_extra(log_record)["jvm_version"] == repr(jvm_version) -def xtest_java_safemode_build_number_check( +def test_java_safemode_build_number_check( monkeypatch: MonkeyPatch, caplog: LogCaptureFixture, application_pid: int, @@ -245,6 +246,7 @@ def xtest_java_safemode_build_number_check( profiler_state: ProfilerState, ) -> None: with make_java_profiler(profiler_state) as profiler: + profiler._profiler_state.get_container_name(application_pid) process = profiler._select_processes_to_profile()[0] jvm_version_str = cast_away_optional(get_java_version(process, profiler._profiler_state.stop_event)) jvm_version = parse_jvm_version(jvm_version_str) @@ 
-261,10 +263,10 @@ def xtest_java_safemode_build_number_check( [ (False, (), False), # default (False, ("-XX:ErrorFile=/tmp/my_custom_error_file.log",), False), # custom error file - # (True, (), False), # containerized (other params are ignored) + (True, (), False), # containerized (other params are ignored) ], ) -def xtest_hotspot_error_file( +def test_hotspot_error_file( application_pid: int, monkeypatch: MonkeyPatch, caplog: LogCaptureFixture, @@ -297,7 +299,7 @@ def start_async_profiler_and_crash(self: AsyncProfiledProcess, *args: Any, **kwa assert profiler._safemode_disable_reason is not None -def xtest_disable_java_profiling( +def test_disable_java_profiling( application_pid: int, monkeypatch: MonkeyPatch, caplog: LogCaptureFixture, @@ -315,7 +317,7 @@ def xtest_disable_java_profiling( assert "Java profiling has been disabled, skipping profiling of all java process" in caplog.text -def xtest_already_loaded_async_profiler_profiling_failure( +def test_already_loaded_async_profiler_profiling_failure( monkeypatch: MonkeyPatch, caplog: LogCaptureFixture, application_pid: int, @@ -339,7 +341,7 @@ def xtest_already_loaded_async_profiler_profiling_failure( # test only once; and don't test in container - as it will go down once we kill the Java app. @pytest.mark.parametrize("in_container", [False]) @pytest.mark.parametrize("check_app_exited", [False]) # we're killing it, the exit check will raise. 
-def xtest_async_profiler_output_written_upon_jvm_exit( +def test_async_profiler_output_written_upon_jvm_exit( tmp_path_world_accessible: Path, application_pid: int, assert_collapsed: AssertInCollapsed, @@ -367,7 +369,7 @@ def delayed_kill() -> None: # test only once @pytest.mark.parametrize("in_container", [False]) -def xtest_async_profiler_stops_after_given_timeout( +def test_async_profiler_stops_after_given_timeout( tmp_path_world_accessible: Path, application_pid: int, assert_collapsed: AssertInCollapsed, @@ -400,7 +402,7 @@ def xtest_async_profiler_stops_after_given_timeout( @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("application_image_tag,search_for", [("j9", "OpenJ9"), ("zing", "Zing")]) -def xtest_sanity_other_jvms( +def test_sanity_other_jvms( application_pid: int, assert_collapsed: AssertInCollapsed, search_for: str, @@ -417,6 +419,7 @@ def xtest_sanity_other_jvms( frequency=99, java_async_profiler_mode="cpu", ) as profiler: + profiler._profiler_state.get_container_name(application_pid) process = psutil.Process(application_pid) assert search_for in cast_away_optional(get_java_version(process, profiler._profiler_state.stop_event)) process_collapsed = snapshot_pid_collapsed(profiler, application_pid) @@ -425,7 +428,7 @@ def xtest_sanity_other_jvms( @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("application_image_tag,search_for", [("eclipse-temurin-latest", "Temurin")]) -def xtest_sanity_latest_jvms( +def test_sanity_latest_jvms( application_pid: int, assert_collapsed: AssertInCollapsed, search_for: str, @@ -438,6 +441,7 @@ def xtest_sanity_latest_jvms( """ with make_java_profiler(profiler_state) as profiler: + profiler._profiler_state.get_container_name(application_pid) # sanity check that this is the correct JVM we're targeting assert search_for in cast_away_optional( get_java_version(psutil.Process(application_pid), profiler._profiler_state.stop_event) @@ -462,7 +466,7 @@ def 
simulate_libjvm_delete(application_pid: int) -> None: # test only once. in a container, so that we don't mess up the environment :) @pytest.mark.parametrize("in_container", [True]) -def xtest_java_deleted_libjvm( +def test_java_deleted_libjvm( application_pid: int, application_docker_container: Container, assert_collapsed: AssertInCollapsed, @@ -501,7 +505,7 @@ def _filter_record(r: LogRecord) -> bool: pytest.param("ro", [docker.types.Mount(target="/tmpfs", source="", type="tmpfs", read_only=True)], id="ro"), ], ) -def xtest_java_noexec_or_ro_dirs( +def test_java_noexec_or_ro_dirs( tmp_path_world_accessible: Path, # will be used by AP for logs & outputs application_pid: int, extra_application_docker_mounts: List[docker.types.Mount], @@ -569,7 +573,7 @@ def xtest_java_noexec_or_ro_dirs( @pytest.mark.parametrize("in_container", [True]) -def xtest_java_symlinks_in_paths( +def test_java_symlinks_in_paths( application_pid: int, application_docker_container: Container, assert_collapsed: AssertInCollapsed, @@ -616,7 +620,7 @@ def xtest_java_symlinks_in_paths( @pytest.mark.parametrize("in_container", [True]) # only in container is enough -def xtest_java_appid_and_metadata_before_process_exits( +def test_java_appid_and_metadata_before_process_exits( application_pid: int, assert_collapsed: AssertInCollapsed, monkeypatch: MonkeyPatch, @@ -657,7 +661,7 @@ def start_async_profiler_and_interrupt(self: AsyncProfiledProcess, *args: Any, * @pytest.mark.parametrize("in_container", [True]) # only in container is enough -def xtest_java_attach_socket_missing( +def test_java_attach_socket_missing( application_pid: int, profiler_state: ProfilerState, ) -> None: @@ -680,7 +684,7 @@ def xtest_java_attach_socket_missing( # we know what messages to expect when in container, not on the host Java @pytest.mark.parametrize("in_container", [True]) -def xtest_java_jattach_async_profiler_log_output( +def test_java_jattach_async_profiler_log_output( application_pid: int, caplog: LogCaptureFixture, 
profiler_state: ProfilerState, @@ -817,7 +821,7 @@ def test_non_java_basename_version( @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("insert_dso_name", [False, True]) -def xtest_dso_name_in_ap_profile( +def test_dso_name_in_ap_profile( application_pid: int, insert_dso_name: bool, profiler_state: ProfilerState, @@ -836,7 +840,7 @@ def xtest_dso_name_in_ap_profile( @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("insert_dso_name", [False, True]) @pytest.mark.parametrize("libc_pattern", [r"(^|;)\(/.*/libc-.*\.so\)($|;)"]) -def xtest_handling_missing_symbol_in_profile( +def test_handling_missing_symbol_in_profile( application_pid: int, insert_dso_name: bool, libc_pattern: str, @@ -852,7 +856,7 @@ def xtest_handling_missing_symbol_in_profile( @pytest.mark.parametrize("in_container", [True]) -def xtest_meminfo_logged( +def test_meminfo_logged( application_pid: int, caplog: LogCaptureFixture, profiler_state: ProfilerState, @@ -869,7 +873,7 @@ def xtest_meminfo_logged( # test that java frames include no semicolon but use a pipe '|' character instead, as implemented by AP @pytest.mark.parametrize("in_container", [True]) -def xtest_java_frames_include_no_semicolons( +def test_java_frames_include_no_semicolons( application_pid: int, profiler_state: ProfilerState, ) -> None: @@ -896,7 +900,7 @@ def xtest_java_frames_include_no_semicolons( # test that async profiler doesn't print anything to applications stdout, stderr streams @pytest.mark.parametrize("in_container", [True]) -def xtest_no_stray_output_in_stdout_stderr( +def test_no_stray_output_in_stdout_stderr( application_pid: int, application_docker_container: Container, monkeypatch: MonkeyPatch, @@ -1094,7 +1098,7 @@ def flush_output_and_stop_async_profiler(self: AsyncProfiledProcess, *args: Any, ), ], ) -def xtest_collect_default_jvm_flags( +def test_collect_default_jvm_flags( profiler_state: ProfilerState, tmp_path: Path, application_pid: int, @@ -1178,7 +1182,7 @@ def 
xtest_collect_default_jvm_flags( ), ], ) -def xtest_collect_cmdline_and_env_jvm_flags( +def test_collect_cmdline_and_env_jvm_flags( docker_client: DockerClient, application_docker_image: Image, assert_collapsed: AssertInCollapsed, @@ -1218,7 +1222,7 @@ def xtest_collect_cmdline_and_env_jvm_flags( @pytest.mark.parametrize("java_cli_flags", ["-XX:MinHeapFreeRatio=5 -XX:MaxHeapFreeRatio=95"]) @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("expected_flags", [[]]) -def xtest_collect_flags_unsupported_filtered_out( +def test_collect_flags_unsupported_filtered_out( docker_client: DockerClient, application_docker_image: Image, assert_collapsed: AssertInCollapsed, @@ -1246,10 +1250,9 @@ def xtest_collect_flags_unsupported_filtered_out( f"exec java {java_cli_flags} -jar Fibonacci.jar", ], ) as container: - assert ( - profiler._metadata.get_jvm_flags_serialized(psutil.Process(container.attrs["State"]["Pid"])) - == expected_flags - ) + pid = container.attrs["State"]["Pid"] + profiler._profiler_state.get_container_name(pid) + assert profiler._metadata.get_jvm_flags_serialized(psutil.Process(pid)) == expected_flags log_record = next(filter(lambda r: r.message == "Missing requested flags:", caplog.records)) # use slicing to remove the leading -XX: instead of removeprefix as it's not available in python 3.8 assert ( @@ -1260,7 +1263,7 @@ def xtest_collect_flags_unsupported_filtered_out( @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("expected_flags", [[]]) -def xtest_collect_none_jvm_flags( +def test_collect_none_jvm_flags( profiler_state: ProfilerState, tmp_path: Path, application_pid: int, @@ -1272,7 +1275,7 @@ def xtest_collect_none_jvm_flags( @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("include_mmm", [True, False]) -def xtest_including_method_modifiers( +def test_including_method_modifiers( application_pid: int, profiler_state: ProfilerState, include_mmm: bool, @@ -1287,7 +1290,7 @@ def 
xtest_including_method_modifiers( @pytest.mark.parametrize("java_line_numbers", ["none", "line-of-function"]) @pytest.mark.parametrize("in_container", [True]) -def xtest_including_line_numbers( +def test_including_line_numbers( application_pid: int, profiler_state: ProfilerState, java_line_numbers: str, diff --git a/tests/test_profiling_mode.py b/tests/test_profiling_mode.py index a237286d1..6ced99a30 100644 --- a/tests/test_profiling_mode.py +++ b/tests/test_profiling_mode.py @@ -56,7 +56,7 @@ def test_sanity( ("java", "ap", True, "java.lang.String[]"), ], ) -def xtest_allocation_being_profiled( +def test_allocation_being_profiled( application_docker_container: Container, docker_client: DockerClient, gprofiler_docker_image: Image, diff --git a/tests/test_python.py b/tests/test_python.py index c4e8943c4..92cd099c1 100644 --- a/tests/test_python.py +++ b/tests/test_python.py @@ -39,7 +39,7 @@ def runtime() -> str: @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("application_image_tag", ["libpython"]) -def xtest_python_select_by_libpython( +def test_python_select_by_libpython( application_pid: int, assert_collapsed: AssertInCollapsed, profiler_state: ProfilerState, @@ -83,7 +83,7 @@ def xtest_python_select_by_libpython( ], ) @pytest.mark.parametrize("profiler_type", ["py-spy", "pyperf"]) -def xtest_python_matrix( +def test_python_matrix( application_pid: int, assert_collapsed: AssertInCollapsed, profiler_type: str, diff --git a/tests/test_sanity.py b/tests/test_sanity.py index 738458f49..7df1238f0 100644 --- a/tests/test_sanity.py +++ b/tests/test_sanity.py @@ -50,7 +50,7 @@ @pytest.mark.parametrize("runtime", ["java"]) -def xtest_java_from_host( +def test_java_from_host( tmp_path_world_accessible: Path, application_pid: int, assert_app_id: Callable, @@ -68,7 +68,7 @@ def xtest_java_from_host( @pytest.mark.parametrize("runtime", ["python"]) -def xtest_pyspy( +def test_pyspy( application_pid: int, assert_collapsed: AssertInCollapsed, 
assert_app_id: Callable, @@ -108,7 +108,7 @@ def test_phpspy( @pytest.mark.parametrize("runtime", ["ruby"]) -def xtest_rbspy( +def test_rbspy( application_pid: int, assert_collapsed: AssertInCollapsed, gprofiler_docker_image: Image, @@ -120,7 +120,7 @@ def xtest_rbspy( @pytest.mark.parametrize("runtime", ["dotnet"]) -def xtest_dotnet_trace( +def test_dotnet_trace( application_pid: int, assert_collapsed: AssertInCollapsed, gprofiler_docker_image: Image, diff --git a/tests/utils.py b/tests/utils.py index dc22f136e..73aa8c5cf 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -141,6 +141,7 @@ def run_privileged_container( def _no_errors(logs: str) -> None: # example line: [2021-06-12 10:13:57,528] ERROR: gprofiler: ruby profiling failed assert "] ERROR: " not in logs, f"found ERRORs in gProfiler logs!: {logs}" + assert "Could not acquire gProfiler's lock" not in logs, f"found lock error in gProfiler logs!: {logs}" def run_gprofiler_in_container(docker_client: DockerClient, image: Image, command: List[str], **kwargs: Any) -> None: @@ -205,7 +206,15 @@ def assert_ldd_version_container(container: Container, version: str) -> None: def snapshot_pid_profile(profiler: ProfilerInterface, pid: int) -> ProfileData: - return profiler.snapshot()[pid] + last_snapshot = None + + def has_profile() -> bool: + nonlocal last_snapshot + last_snapshot = profiler.snapshot() + return pid in last_snapshot + + wait_event(timeout=5, stop_event=Event(), condition=has_profile, interval=1) + return last_snapshot[pid] # type: ignore def snapshot_pid_collapsed(profiler: ProfilerInterface, pid: int) -> StackToSampleCount: From 755fc4ed35e32a1aba456bf2321c5cf1df7cca2e Mon Sep 17 00:00:00 2001 From: slicklash Date: Mon, 17 Jun 2024 18:42:46 +0300 Subject: [PATCH 08/24] add deprecated --- gprofiler/main.py | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/gprofiler/main.py b/gprofiler/main.py index 32a72b003..498a2ad76 100644 --- a/gprofiler/main.py +++ 
b/gprofiler/main.py @@ -790,11 +790,7 @@ def parse_cmd_args() -> configargparse.Namespace: action="store_true", dest="databricks_job_name_as_service_name", default=False, - help="gProfiler will set service name to Databricks' job name on ephemeral clusters. It'll delay the beginning" - " of the profiling due to repeated waiting for Spark's metrics server." - ' service name format is: "databricks-job-".' - " Note that in any case that the job name is not available due to redaction," - " gProfiler will fallback to use the clusterName property.", + help="Deprecated! Removed in version 1.49.0", ) parser.add_argument( @@ -1002,6 +998,9 @@ def warn_about_deprecated_args(args: configargparse.Namespace) -> None: if args.collect_spark_metrics: logger.warning("--collect-spark-metrics is deprecated and removed in version 1.42.0") + if args.databricks_job_name_as_service_name: + logger.warning("--databricks-job-name-as-service-name is deprecated and removed in version 1.49.0") + def main() -> None: args = parse_cmd_args() @@ -1042,19 +1041,6 @@ def main() -> None: # assume we run in the root cgroup (when containerized, that's our view) usage_logger = CgroupsUsageLogger(logger, "/") if args.log_usage else NoopUsageLogger() - # if args.databricks_job_name_as_service_name: - # # "databricks" will be the default name in case of failure with --databricks-job-name-as-service-name flag - # args.service_name = "databricks" - # dbx_web_ui_wrapper = DBXWebUIEnvWrapper(logger) - # dbx_metadata = dbx_web_ui_wrapper.all_props_dict - # if dbx_metadata is not None: - # service_suffix = get_name_from_metadata(dbx_metadata) - # if service_suffix is not None: - # args.service_name = f"databricks-{service_suffix}" - # - # if remote_logs_handler is not None: - # remote_logs_handler.update_service_name(args.service_name) - try: logger.info( "Running gProfiler", version=__version__, commandline=" ".join(sys.argv[1:]), arguments=args.__dict__ From 4c6b6b34e64b9b2d57321128cd6a8ee61d956cb0 Mon Sep 17 
00:00:00 2001 From: slicklash Date: Tue, 18 Jun 2024 15:53:58 +0300 Subject: [PATCH 09/24] up --- tests/test_perf.py | 3 +++ tests/utils.py | 8 +++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/test_perf.py b/tests/test_perf.py index 04b144a05..cbdb62f2b 100644 --- a/tests/test_perf.py +++ b/tests/test_perf.py @@ -15,6 +15,7 @@ # import logging +import time from threading import Event from typing import Dict, cast @@ -144,6 +145,7 @@ def test_perf_comm_change( I'm not sure it can be done, i.e is this info even kept anywhere). """ with system_profiler as profiler: + time.sleep(2) # first run - we get the changed name, because the app started before perf began recording. _assert_comm_in_profile(profiler, application_pid, False) @@ -170,6 +172,7 @@ def test_perf_thread_comm_is_process_comm( starts after perf, the exec comm of the process should be used (see test_perf_comm_change) """ with system_profiler as profiler: + time.sleep(2) # running perf & script now with --show-task-events would show: # pative 1925947 [010] 987095.272656: PERF_RECORD_COMM: pative:1925904/1925947 # our perf will prefer to use the exec comm, OR oldest comm available if exec diff --git a/tests/utils.py b/tests/utils.py index 73aa8c5cf..b1203dd66 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -206,15 +206,17 @@ def assert_ldd_version_container(container: Container, version: str) -> None: def snapshot_pid_profile(profiler: ProfilerInterface, pid: int) -> ProfileData: - last_snapshot = None + last_snapshot = profiler.snapshot() def has_profile() -> bool: nonlocal last_snapshot + if pid in last_snapshot: + return True last_snapshot = profiler.snapshot() return pid in last_snapshot - wait_event(timeout=5, stop_event=Event(), condition=has_profile, interval=1) - return last_snapshot[pid] # type: ignore + wait_event(timeout=5, stop_event=Event(), condition=has_profile, interval=0.1) + return last_snapshot[pid] def snapshot_pid_collapsed(profiler: ProfilerInterface, 
pid: int) -> StackToSampleCount: From f9a5cb2cb2f0b3020e81f0f97e390786c077289a Mon Sep 17 00:00:00 2001 From: slicklash Date: Tue, 25 Jun 2024 13:39:22 +0300 Subject: [PATCH 10/24] up --- gprofiler/containers_client.py | 9 +-------- gprofiler/profilers/java.py | 2 +- gprofiler/profilers/php.py | 3 +-- gprofiler/profilers/python_ebpf.py | 2 +- tests/test_java.py | 11 ++++------- 5 files changed, 8 insertions(+), 19 deletions(-) diff --git a/gprofiler/containers_client.py b/gprofiler/containers_client.py index 1876eba10..0492972ad 100644 --- a/gprofiler/containers_client.py +++ b/gprofiler/containers_client.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import time from typing import Dict, List, Optional, Set from granulate_utils.containers.client import ContainersClient @@ -26,8 +25,6 @@ logger = get_logger_adapter(__name__) -NEWLY_CREATED_CONTAINER_AGE_IN_SECONDS = 3 - class ContainerNamesClient: def __init__(self) -> None: @@ -76,13 +73,9 @@ def get_container_name(self, pid: int) -> str: def _safely_get_process_container_name(self, pid: int) -> Optional[str]: try: try: - process = Process(pid) - container_id = get_process_container_id(process) + container_id = get_process_container_id(Process(pid)) if container_id is None: return None - # If the container is newly created, we wait a bit to make sure the container is available - if time.time() - process.create_time() <= NEWLY_CREATED_CONTAINER_AGE_IN_SECONDS: - time.sleep(2) except NoSuchProcess: return None return self._get_container_name(container_id) diff --git a/gprofiler/profilers/java.py b/gprofiler/profilers/java.py index c722fc7ce..a5737541f 100644 --- a/gprofiler/profilers/java.py +++ b/gprofiler/profilers/java.py @@ -1229,7 +1229,6 @@ def _check_async_profiler_loaded(self, process: Process) -> bool: def _profile_process(self, process: Process, duration: int, spawned: bool) -> ProfileData: comm = process_comm(process) exe = 
process_exe(process) - container_name = self._profiler_state.get_container_name(process.pid) java_version_output: Optional[str] = get_java_version_logged(process, self._profiler_state.stop_event) if self._enabled_proc_events_java: @@ -1259,6 +1258,7 @@ def _profile_process(self, process: Process, duration: int, spawned: bool) -> Pr self._profiled_pids.add(process.pid) logger.info(f"Profiling{' spawned' if spawned else ''} process {process.pid} with async-profiler") + container_name = self._profiler_state.get_container_name(process.pid) app_metadata = self._metadata.get_metadata(process) appid = application_identifiers.get_java_app_id(process, self._collect_spark_app_name) diff --git a/gprofiler/profilers/php.py b/gprofiler/profilers/php.py index 882592f5c..bab63f266 100644 --- a/gprofiler/profilers/php.py +++ b/gprofiler/profilers/php.py @@ -210,11 +210,10 @@ def extract_metadata_section(re_expr: Pattern, metadata_line: str) -> str: if profiler_state.processes_to_profile is not None: if pid not in [process.pid for process in profiler_state.processes_to_profile]: continue - container_name = profiler_state.get_container_name(pid) # TODO: appid & app metadata for php! 
appid = None app_metadata = None - profiles[pid] = ProfileData(results[pid], appid, app_metadata, container_name) + profiles[pid] = ProfileData(results[pid], appid, app_metadata, profiler_state.get_container_name(pid)) return profiles diff --git a/gprofiler/profilers/python_ebpf.py b/gprofiler/profilers/python_ebpf.py index 8564cf07f..e2b44bc9e 100644 --- a/gprofiler/profilers/python_ebpf.py +++ b/gprofiler/profilers/python_ebpf.py @@ -262,9 +262,9 @@ def snapshot(self) -> ProcessToProfileData: if self._profiler_state.processes_to_profile is not None: if process not in self._profiler_state.processes_to_profile: continue - container_name = self._profiler_state.get_container_name(pid) appid = application_identifiers.get_python_app_id(process) app_metadata = self._metadata.get_metadata(process) + container_name = self._profiler_state.get_container_name(pid) except NoSuchProcess: appid = None app_metadata = None diff --git a/tests/test_java.py b/tests/test_java.py index 125db51dc..620d7352a 100644 --- a/tests/test_java.py +++ b/tests/test_java.py @@ -226,7 +226,6 @@ def test_java_safemode_version_check( monkeypatch.setitem(JavaProfiler.MINIMAL_SUPPORTED_VERSIONS, 8, (Version("8.999"), 0)) with make_java_profiler(profiler_state) as profiler: - profiler._profiler_state.get_container_name(application_pid) process = profiler._select_processes_to_profile()[0] jvm_version_str = cast_away_optional(get_java_version(process, profiler._profiler_state.stop_event)) jvm_version = parse_jvm_version(jvm_version_str) @@ -246,7 +245,6 @@ def test_java_safemode_build_number_check( profiler_state: ProfilerState, ) -> None: with make_java_profiler(profiler_state) as profiler: - profiler._profiler_state.get_container_name(application_pid) process = profiler._select_processes_to_profile()[0] jvm_version_str = cast_away_optional(get_java_version(process, profiler._profiler_state.stop_event)) jvm_version = parse_jvm_version(jvm_version_str) @@ -419,7 +417,6 @@ def test_sanity_other_jvms( 
frequency=99, java_async_profiler_mode="cpu", ) as profiler: - profiler._profiler_state.get_container_name(application_pid) process = psutil.Process(application_pid) assert search_for in cast_away_optional(get_java_version(process, profiler._profiler_state.stop_event)) process_collapsed = snapshot_pid_collapsed(profiler, application_pid) @@ -441,7 +438,6 @@ def test_sanity_latest_jvms( """ with make_java_profiler(profiler_state) as profiler: - profiler._profiler_state.get_container_name(application_pid) # sanity check that this is the correct JVM we're targeting assert search_for in cast_away_optional( get_java_version(psutil.Process(application_pid), profiler._profiler_state.stop_event) @@ -1250,9 +1246,10 @@ def test_collect_flags_unsupported_filtered_out( f"exec java {java_cli_flags} -jar Fibonacci.jar", ], ) as container: - pid = container.attrs["State"]["Pid"] - profiler._profiler_state.get_container_name(pid) - assert profiler._metadata.get_jvm_flags_serialized(psutil.Process(pid)) == expected_flags + assert ( + profiler._metadata.get_jvm_flags_serialized(psutil.Process(container.attrs["State"]["Pid"])) + == expected_flags + ) log_record = next(filter(lambda r: r.message == "Missing requested flags:", caplog.records)) # use slicing to remove the leading -XX: instead of removeprefix as it's not available in python 3.8 assert ( From 076f1a6d97f09c9fc255c7aada16e5e8feb85e71 Mon Sep 17 00:00:00 2001 From: slicklash Date: Tue, 25 Jun 2024 16:36:39 +0300 Subject: [PATCH 11/24] Revert "up" This reverts commit f9a5cb2cb2f0b3020e81f0f97e390786c077289a. 
--- gprofiler/containers_client.py | 9 ++++++++- gprofiler/profilers/java.py | 2 +- gprofiler/profilers/php.py | 3 ++- gprofiler/profilers/python_ebpf.py | 2 +- tests/test_java.py | 11 +++++++---- 5 files changed, 19 insertions(+), 8 deletions(-) diff --git a/gprofiler/containers_client.py b/gprofiler/containers_client.py index 0492972ad..1876eba10 100644 --- a/gprofiler/containers_client.py +++ b/gprofiler/containers_client.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import time from typing import Dict, List, Optional, Set from granulate_utils.containers.client import ContainersClient @@ -25,6 +26,8 @@ logger = get_logger_adapter(__name__) +NEWLY_CREATED_CONTAINER_AGE_IN_SECONDS = 3 + class ContainerNamesClient: def __init__(self) -> None: @@ -73,9 +76,13 @@ def get_container_name(self, pid: int) -> str: def _safely_get_process_container_name(self, pid: int) -> Optional[str]: try: try: - container_id = get_process_container_id(Process(pid)) + process = Process(pid) + container_id = get_process_container_id(process) if container_id is None: return None + # If the container is newly created, we wait a bit to make sure the container is available + if time.time() - process.create_time() <= NEWLY_CREATED_CONTAINER_AGE_IN_SECONDS: + time.sleep(2) except NoSuchProcess: return None return self._get_container_name(container_id) diff --git a/gprofiler/profilers/java.py b/gprofiler/profilers/java.py index a5737541f..c722fc7ce 100644 --- a/gprofiler/profilers/java.py +++ b/gprofiler/profilers/java.py @@ -1229,6 +1229,7 @@ def _check_async_profiler_loaded(self, process: Process) -> bool: def _profile_process(self, process: Process, duration: int, spawned: bool) -> ProfileData: comm = process_comm(process) exe = process_exe(process) + container_name = self._profiler_state.get_container_name(process.pid) java_version_output: Optional[str] = get_java_version_logged(process, 
self._profiler_state.stop_event) if self._enabled_proc_events_java: @@ -1258,7 +1259,6 @@ def _profile_process(self, process: Process, duration: int, spawned: bool) -> Pr self._profiled_pids.add(process.pid) logger.info(f"Profiling{' spawned' if spawned else ''} process {process.pid} with async-profiler") - container_name = self._profiler_state.get_container_name(process.pid) app_metadata = self._metadata.get_metadata(process) appid = application_identifiers.get_java_app_id(process, self._collect_spark_app_name) diff --git a/gprofiler/profilers/php.py b/gprofiler/profilers/php.py index bab63f266..882592f5c 100644 --- a/gprofiler/profilers/php.py +++ b/gprofiler/profilers/php.py @@ -210,10 +210,11 @@ def extract_metadata_section(re_expr: Pattern, metadata_line: str) -> str: if profiler_state.processes_to_profile is not None: if pid not in [process.pid for process in profiler_state.processes_to_profile]: continue + container_name = profiler_state.get_container_name(pid) # TODO: appid & app metadata for php! 
appid = None app_metadata = None - profiles[pid] = ProfileData(results[pid], appid, app_metadata, profiler_state.get_container_name(pid)) + profiles[pid] = ProfileData(results[pid], appid, app_metadata, container_name) return profiles diff --git a/gprofiler/profilers/python_ebpf.py b/gprofiler/profilers/python_ebpf.py index 05535cb50..5c075cccd 100644 --- a/gprofiler/profilers/python_ebpf.py +++ b/gprofiler/profilers/python_ebpf.py @@ -291,9 +291,9 @@ def snapshot(self) -> ProcessToProfileData: if self._profiler_state.processes_to_profile is not None: if process not in self._profiler_state.processes_to_profile: continue + container_name = self._profiler_state.get_container_name(pid) appid = application_identifiers.get_python_app_id(process) app_metadata = self._metadata.get_metadata(process) - container_name = self._profiler_state.get_container_name(pid) except NoSuchProcess: appid = None app_metadata = None diff --git a/tests/test_java.py b/tests/test_java.py index 620d7352a..125db51dc 100644 --- a/tests/test_java.py +++ b/tests/test_java.py @@ -226,6 +226,7 @@ def test_java_safemode_version_check( monkeypatch.setitem(JavaProfiler.MINIMAL_SUPPORTED_VERSIONS, 8, (Version("8.999"), 0)) with make_java_profiler(profiler_state) as profiler: + profiler._profiler_state.get_container_name(application_pid) process = profiler._select_processes_to_profile()[0] jvm_version_str = cast_away_optional(get_java_version(process, profiler._profiler_state.stop_event)) jvm_version = parse_jvm_version(jvm_version_str) @@ -245,6 +246,7 @@ def test_java_safemode_build_number_check( profiler_state: ProfilerState, ) -> None: with make_java_profiler(profiler_state) as profiler: + profiler._profiler_state.get_container_name(application_pid) process = profiler._select_processes_to_profile()[0] jvm_version_str = cast_away_optional(get_java_version(process, profiler._profiler_state.stop_event)) jvm_version = parse_jvm_version(jvm_version_str) @@ -417,6 +419,7 @@ def test_sanity_other_jvms( 
frequency=99, java_async_profiler_mode="cpu", ) as profiler: + profiler._profiler_state.get_container_name(application_pid) process = psutil.Process(application_pid) assert search_for in cast_away_optional(get_java_version(process, profiler._profiler_state.stop_event)) process_collapsed = snapshot_pid_collapsed(profiler, application_pid) @@ -438,6 +441,7 @@ def test_sanity_latest_jvms( """ with make_java_profiler(profiler_state) as profiler: + profiler._profiler_state.get_container_name(application_pid) # sanity check that this is the correct JVM we're targeting assert search_for in cast_away_optional( get_java_version(psutil.Process(application_pid), profiler._profiler_state.stop_event) @@ -1246,10 +1250,9 @@ def test_collect_flags_unsupported_filtered_out( f"exec java {java_cli_flags} -jar Fibonacci.jar", ], ) as container: - assert ( - profiler._metadata.get_jvm_flags_serialized(psutil.Process(container.attrs["State"]["Pid"])) - == expected_flags - ) + pid = container.attrs["State"]["Pid"] + profiler._profiler_state.get_container_name(pid) + assert profiler._metadata.get_jvm_flags_serialized(psutil.Process(pid)) == expected_flags log_record = next(filter(lambda r: r.message == "Missing requested flags:", caplog.records)) # use slicing to remove the leading -XX: instead of removeprefix as it's not available in python 3.8 assert ( From 5290cfcc16e24f3a172e76e4fcd26bfbde18e97d Mon Sep 17 00:00:00 2001 From: slicklash Date: Wed, 3 Jul 2024 13:56:24 +0300 Subject: [PATCH 12/24] clean up started processes --- gprofiler/utils/__init__.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/gprofiler/utils/__init__.py b/gprofiler/utils/__init__.py index 97346c4b1..98143c5ba 100644 --- a/gprofiler/utils/__init__.py +++ b/gprofiler/utils/__init__.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +import atexit import ctypes import datetime import glob @@ -70,6 +71,10 @@ gprofiler_mutex: Optional[socket.socket] = None +STATUS_KILL = 137 +STATUS_INTERRUPT = 130 +_processes: List[Popen] = [] + @lru_cache(maxsize=None) def resource_path(relative_path: str = "") -> str: @@ -91,6 +96,8 @@ def is_root() -> bool: def start_process(cmd: Union[str, List[str]], via_staticx: bool = False, **kwargs: Any) -> Popen: + global _processes + if isinstance(cmd, str): cmd = [cmd] @@ -112,7 +119,7 @@ def start_process(cmd: Union[str, List[str]], via_staticx: bool = False, **kwarg env = env if env is not None else os.environ.copy() env.update({"LD_LIBRARY_PATH": ""}) - popen = Popen( + process = Popen( cmd, stdout=kwargs.pop("stdout", subprocess.PIPE), stderr=kwargs.pop("stderr", subprocess.PIPE), @@ -121,7 +128,8 @@ def start_process(cmd: Union[str, List[str]], via_staticx: bool = False, **kwarg env=env, **kwargs, ) - return popen + _processes.append(process) + return process def wait_event(timeout: float, stop_event: Event, condition: Callable[[], bool], interval: float = 0.1) -> None: @@ -507,3 +515,17 @@ def merge_dicts(source: Dict[str, Any], dest: Dict[str, Any]) -> Dict[str, Any]: def is_profiler_disabled(profile_mode: str) -> bool: return profile_mode in ("none", "disabled") + + +def _exit_handler() -> None: + for process in _processes: + process.kill() + + +def _kill_handler(*args: Any) -> None: + sys.exit(STATUS_KILL if args[0] == signal.SIGTERM else STATUS_INTERRUPT) + + +atexit.register(_exit_handler) +signal.signal(signal.SIGINT, _kill_handler) +signal.signal(signal.SIGTERM, _kill_handler) From 2e6b85e306d78d2cb34e9fa7ae11223220acf519 Mon Sep 17 00:00:00 2001 From: slicklash Date: Thu, 4 Jul 2024 14:19:35 +0300 Subject: [PATCH 13/24] centos 7 fix --- executable.Dockerfile | 15 +++++++++++---- scripts/fix_centos7.sh | 8 ++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) create mode 100755 scripts/fix_centos7.sh diff --git a/executable.Dockerfile 
b/executable.Dockerfile index 7d539631d..b674281cf 100644 --- a/executable.Dockerfile +++ b/executable.Dockerfile @@ -74,8 +74,10 @@ RUN ./phpspy_build.sh # async-profiler glibc FROM centos${AP_BUILDER_CENTOS} AS async-profiler-builder-glibc WORKDIR /tmp - -COPY scripts/async_profiler_env_glibc.sh . +COPY scripts/async_profiler_env_glibc.sh scripts/fix_centos7.sh ./ +RUN if grep -q "CentOS Linux" /etc/os-release ; then \ + ./fix_centos7.sh; \ + fi RUN ./async_profiler_env_glibc.sh COPY scripts/async_profiler_build_shared.sh . @@ -139,10 +141,12 @@ RUN ./pyperf_build.sh --with-staticx FROM centos${GPROFILER_BUILDER} AS build-prepare WORKDIR /tmp -COPY scripts/fix_centos8.sh . +COPY scripts/fix_centos7.sh scripts/fix_centos8.sh ./ # fix repo links for CentOS 8, and enable powertools (required to download glibc-static) RUN if grep -q "CentOS Linux 8" /etc/os-release ; then \ ./fix_centos8.sh; \ + elif grep -q "CentOS Linux" /etc/os-release ; then \ + ./fix_centos7.sh; \ fi # update libmodulemd to fix https://bugzilla.redhat.com/show_bug.cgi?id=2004853 @@ -200,7 +204,10 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip FROM ${NODE_PACKAGE_BUILDER_GLIBC} as node-package-builder-glibc USER 0 WORKDIR /tmp -COPY scripts/node_builder_glibc_env.sh . +COPY scripts/node_builder_glibc_env.sh scripts/fix_centos7.sh ./ +RUN if grep -q "CentOS Linux" /etc/os-release ; then \ + ./fix_centos7.sh; \ + fi RUN ./node_builder_glibc_env.sh COPY scripts/build_node_package.sh . 
RUN ./build_node_package.sh diff --git a/scripts/fix_centos7.sh b/scripts/fix_centos7.sh new file mode 100755 index 000000000..f4c60a283 --- /dev/null +++ b/scripts/fix_centos7.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +set -eu + +sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo +sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo +sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo +yum clean all From 4a229e4070f35f0f2109d83aeaa0953cf294d8d6 Mon Sep 17 00:00:00 2001 From: slicklash Date: Mon, 15 Jul 2024 14:15:25 +0300 Subject: [PATCH 14/24] use setup_signals --- gprofiler/main.py | 30 ++---------------------------- gprofiler/utils/__init__.py | 34 +++++++++++++++++++++++++--------- 2 files changed, 27 insertions(+), 37 deletions(-) diff --git a/gprofiler/main.py b/gprofiler/main.py index 9e845e891..597f85820 100644 --- a/gprofiler/main.py +++ b/gprofiler/main.py @@ -20,13 +20,12 @@ import logging.handlers import os import shutil -import signal import sys import time import traceback from pathlib import Path from threading import Event -from types import FrameType, TracebackType +from types import TracebackType from typing import Iterable, List, Optional, Type, cast import configargparse @@ -74,6 +73,7 @@ reset_umask, resource_path, run_process, + setup_signals, ) from gprofiler.utils.fs import escape_filename, mkdir_owned_root from gprofiler.utils.proxy import get_https_proxy @@ -98,21 +98,6 @@ UPLOAD_FILE_SUBCOMMAND = "upload-file" -# 1 KeyboardInterrupt raised per this many seconds, no matter how many SIGINTs we get. -SIGINT_RATELIMIT = 0.5 - -last_signal_ts: Optional[float] = None - - -def sigint_handler(sig: int, frame: Optional[FrameType]) -> None: - global last_signal_ts - ts = time.monotonic() - # no need for atomicity here: we can't get another SIGINT before this one returns. 
- # https://www.gnu.org/software/libc/manual/html_node/Signals-in-Handler.html#Signals-in-Handler - if last_signal_ts is None or ts > last_signal_ts + SIGINT_RATELIMIT: - last_signal_ts = ts - raise KeyboardInterrupt - class GProfiler: def __init__( @@ -939,17 +924,6 @@ def verify_preconditions(args: configargparse.Namespace, processes_to_profile: O sys.exit(1) -def setup_signals() -> None: - # When we run under staticx & PyInstaller, both of them forward (some of the) signals to gProfiler. - # We catch SIGINTs and ratelimit them, to avoid being interrupted again during the handling of the - # first INT. - # See my commit message for more information. - signal.signal(signal.SIGINT, sigint_handler) - # handle SIGTERM in the same manner - gracefully stop gProfiler. - # SIGTERM is also forwarded by staticx & PyInstaller, so we need to ratelimit it. - signal.signal(signal.SIGTERM, sigint_handler) - - def log_system_info() -> None: system_info = get_static_system_info() logger.info(f"gProfiler Python version: {system_info.python_version}") diff --git a/gprofiler/utils/__init__.py b/gprofiler/utils/__init__.py index 98143c5ba..856841e13 100644 --- a/gprofiler/utils/__init__.py +++ b/gprofiler/utils/__init__.py @@ -34,6 +34,7 @@ from subprocess import CompletedProcess, Popen, TimeoutExpired from tempfile import TemporaryDirectory from threading import Event +from types import FrameType from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union, cast import importlib_resources @@ -71,8 +72,10 @@ gprofiler_mutex: Optional[socket.socket] = None -STATUS_KILL = 137 -STATUS_INTERRUPT = 130 +# 1 KeyboardInterrupt raised per this many seconds, no matter how many SIGINTs we get. 
+SIGINT_RATELIMIT = 0.5 + +_last_signal_ts: Optional[float] = None _processes: List[Popen] = [] @@ -522,10 +525,23 @@ def _exit_handler() -> None: process.kill() -def _kill_handler(*args: Any) -> None: - sys.exit(STATUS_KILL if args[0] == signal.SIGTERM else STATUS_INTERRUPT) - - -atexit.register(_exit_handler) -signal.signal(signal.SIGINT, _kill_handler) -signal.signal(signal.SIGTERM, _kill_handler) +def _sigint_handler(sig: int, frame: Optional[FrameType]) -> None: + global _last_signal_ts + ts = time.monotonic() + # no need for atomicity here: we can't get another SIGINT before this one returns. + # https://www.gnu.org/software/libc/manual/html_node/Signals-in-Handler.html#Signals-in-Handler + if _last_signal_ts is None or ts > _last_signal_ts + SIGINT_RATELIMIT: + _last_signal_ts = ts + raise KeyboardInterrupt + + +def setup_signals() -> None: + atexit.register(_exit_handler) + # When we run under staticx & PyInstaller, both of them forward (some of the) signals to gProfiler. + # We catch SIGINTs and ratelimit them, to avoid being interrupted again during the handling of the + # first INT. + # See my commit message for more information. + signal.signal(signal.SIGINT, _sigint_handler) + # handle SIGTERM in the same manner - gracefully stop gProfiler. + # SIGTERM is also forwarded by staticx & PyInstaller, so we need to ratelimit it. 
+ signal.signal(signal.SIGTERM, _sigint_handler) From de24119babc396329aac0ac7ea86ba61bfa6c23d Mon Sep 17 00:00:00 2001 From: slicklash Date: Tue, 16 Jul 2024 12:39:11 +0300 Subject: [PATCH 15/24] remove container delay single ContainersClient --- gprofiler/containers_client.py | 17 +++++++---------- tests/test.sh | 2 +- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/gprofiler/containers_client.py b/gprofiler/containers_client.py index 1876eba10..b407ee352 100644 --- a/gprofiler/containers_client.py +++ b/gprofiler/containers_client.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import time from typing import Dict, List, Optional, Set from granulate_utils.containers.client import ContainersClient @@ -26,14 +25,16 @@ logger = get_logger_adapter(__name__) -NEWLY_CREATED_CONTAINER_AGE_IN_SECONDS = 3 +_containers_client: Optional[ContainersClient] = None class ContainerNamesClient: def __init__(self) -> None: + global _containers_client try: - self._containers_client: Optional[ContainersClient] = ContainersClient() - logger.info(f"Discovered container runtimes: {self._containers_client.get_runtimes()}") + if _containers_client is None: + _containers_client = ContainersClient() + logger.info(f"Discovered container runtimes: {_containers_client.get_runtimes()}") except NoContainerRuntimesError: logger.warning( "Could not find a Docker daemon or CRI-compatible daemon, profiling data will not" @@ -41,7 +42,6 @@ def __init__(self) -> None: " please open a new issue here:" " https://github.com/Granulate/gprofiler/issues/new" ) - self._containers_client = None self._pid_to_container_name_cache: Dict[int, str] = {} self._current_container_names: Set[str] = set() @@ -56,7 +56,7 @@ def container_names(self) -> List[str]: return list(self._current_container_names) def get_container_name(self, pid: int) -> str: - if self._containers_client is None: + if _containers_client is None: 
return "" if not valid_perf_pid(pid): @@ -80,9 +80,6 @@ def _safely_get_process_container_name(self, pid: int) -> Optional[str]: container_id = get_process_container_id(process) if container_id is None: return None - # If the container is newly created, we wait a bit to make sure the container is available - if time.time() - process.create_time() <= NEWLY_CREATED_CONTAINER_AGE_IN_SECONDS: - time.sleep(2) except NoSuchProcess: return None return self._get_container_name(container_id) @@ -110,5 +107,5 @@ def _get_container_name(self, container_id: str) -> Optional[str]: def _refresh_container_names_cache(self) -> None: # We re-fetch all of the currently running containers, so in order to keep the cache small we clear it self._container_id_to_name_cache.clear() - for container in self._containers_client.list_containers() if self._containers_client is not None else []: + for container in _containers_client.list_containers() if _containers_client is not None else []: self._container_id_to_name_cache[container.id] = container.name diff --git a/tests/test.sh b/tests/test.sh index 14065ef5a..18f547aba 100755 --- a/tests/test.sh +++ b/tests/test.sh @@ -48,4 +48,4 @@ fi python3 -m pip install -q --upgrade setuptools pip python3 -m pip install -r ./requirements.txt -r ./exe-requirements.txt -r ./dev-requirements.txt # TODO: python3 -m pip install . 
-sudo env "PATH=$PATH" python3 -m pytest -v tests/ "$@" +sudo -E env "PATH=$PATH" python3 -m pytest -v tests/ "$@" From f11be0b50a8342c274ed4c4ffade76d2577831d2 Mon Sep 17 00:00:00 2001 From: slicklash Date: Mon, 29 Jul 2024 16:20:12 +0300 Subject: [PATCH 16/24] explicit Centos 7 check --- executable.Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/executable.Dockerfile b/executable.Dockerfile index b674281cf..7b181c6b6 100644 --- a/executable.Dockerfile +++ b/executable.Dockerfile @@ -75,7 +75,7 @@ RUN ./phpspy_build.sh FROM centos${AP_BUILDER_CENTOS} AS async-profiler-builder-glibc WORKDIR /tmp COPY scripts/async_profiler_env_glibc.sh scripts/fix_centos7.sh ./ -RUN if grep -q "CentOS Linux" /etc/os-release ; then \ +RUN if grep -q "CentOS Linux 7" /etc/os-release ; then \ ./fix_centos7.sh; \ fi RUN ./async_profiler_env_glibc.sh @@ -145,7 +145,7 @@ COPY scripts/fix_centos7.sh scripts/fix_centos8.sh ./ # fix repo links for CentOS 8, and enable powertools (required to download glibc-static) RUN if grep -q "CentOS Linux 8" /etc/os-release ; then \ ./fix_centos8.sh; \ - elif grep -q "CentOS Linux" /etc/os-release ; then \ + elif grep -q "CentOS Linux 7" /etc/os-release ; then \ ./fix_centos7.sh; \ fi @@ -205,7 +205,7 @@ FROM ${NODE_PACKAGE_BUILDER_GLIBC} as node-package-builder-glibc USER 0 WORKDIR /tmp COPY scripts/node_builder_glibc_env.sh scripts/fix_centos7.sh ./ -RUN if grep -q "CentOS Linux" /etc/os-release ; then \ +RUN if grep -q "CentOS Linux 7" /etc/os-release ; then \ ./fix_centos7.sh; \ fi RUN ./node_builder_glibc_env.sh From c825eb7e4fba124ea0e13fce44609a17c78f7d7d Mon Sep 17 00:00:00 2001 From: slicklash Date: Mon, 2 Sep 2024 14:01:28 +0200 Subject: [PATCH 17/24] bump granulate_utils --- granulate-utils | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/granulate-utils b/granulate-utils index 80a9dda7f..5866ef2fe 160000 --- a/granulate-utils +++ b/granulate-utils @@ -1 +1 @@ -Subproject commit 
80a9dda7f8e2309b933d9c3f7a521bd0a0f8f3d5 +Subproject commit 5866ef2fe75ac7c91c8a05b3df88f6edffd91c9a From a5d8045bf16ed8688c7d4f208bd9a43730f16632 Mon Sep 17 00:00:00 2001 From: slicklash Date: Mon, 2 Sep 2024 17:49:15 +0200 Subject: [PATCH 18/24] add pdeathsigger --- executable.Dockerfile | 6 ++++++ scripts/pdeathsigger.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 scripts/pdeathsigger.c diff --git a/executable.Dockerfile b/executable.Dockerfile index 7b181c6b6..248113737 100644 --- a/executable.Dockerfile +++ b/executable.Dockerfile @@ -211,6 +211,10 @@ RUN if grep -q "CentOS Linux 7" /etc/os-release ; then \ RUN ./node_builder_glibc_env.sh COPY scripts/build_node_package.sh . RUN ./build_node_package.sh + +COPY scripts/pdeathsigger.c . +RUN gcc -o pdeathsigger pdeathsigger.c + # needed for hadolint WORKDIR /app USER 1001 @@ -261,6 +265,8 @@ COPY --from=async-profiler-builder-musl /tmp/async-profiler/build/lib/libasyncPr COPY --from=node-package-builder-musl /tmp/module_build gprofiler/resources/node/module/musl COPY --from=node-package-builder-glibc /tmp/module_build gprofiler/resources/node/module/glibc +COPY --from=node-package-builder-glibc /tmp/pdeathsigger gprofiler/resources/pdeathsigger + COPY --from=burn-builder /tmp/burn/burn gprofiler/resources/burn COPY gprofiler gprofiler diff --git a/scripts/pdeathsigger.c b/scripts/pdeathsigger.c new file mode 100644 index 000000000..fe8f5b98e --- /dev/null +++ b/scripts/pdeathsigger.c @@ -0,0 +1,28 @@ +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <sys/prctl.h> +#include <unistd.h> + +/* + preexec_fn is not safe to use in the presence of threads, + child process could deadlock before exec is called. + this little shim is a workaround to avoid using preexec_fn and + still get the desired behavior (PR_SET_PDEATHSIG). 
+*/ +int main(int argc, char *argv[]) { + if (argc < 2) { + fprintf(stderr, "Usage: %s /path/to/binary [args...]\n", argv[0]); + return 1; + } + + if (prctl(PR_SET_PDEATHSIG, SIGKILL) == -1) { + perror("prctl"); + return 1; + } + + execvp(argv[1], &argv[1]); + + perror("execvp"); + return 1; +} From 31cc0eb2af320c342630ced7699fec329984aee8 Mon Sep 17 00:00:00 2001 From: slicklash Date: Mon, 2 Sep 2024 17:59:22 +0200 Subject: [PATCH 19/24] use pdeathsigger --- gprofiler/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gprofiler/utils/__init__.py b/gprofiler/utils/__init__.py index 856841e13..8bfc56713 100644 --- a/gprofiler/utils/__init__.py +++ b/gprofiler/utils/__init__.py @@ -123,7 +123,7 @@ def start_process(cmd: Union[str, List[str]], via_staticx: bool = False, **kwarg env.update({"LD_LIBRARY_PATH": ""}) process = Popen( - cmd, + [resource_path("pdeathsigger")] + cmd if is_linux() else cmd, stdout=kwargs.pop("stdout", subprocess.PIPE), stderr=kwargs.pop("stderr", subprocess.PIPE), stdin=subprocess.PIPE, From 734538a7328269a384483f867bb983ccee4cc9d0 Mon Sep 17 00:00:00 2001 From: slicklash Date: Mon, 2 Sep 2024 21:47:38 +0200 Subject: [PATCH 20/24] make pgrep_maps compatible with pdeathsigger --- gprofiler/utils/__init__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/gprofiler/utils/__init__.py b/gprofiler/utils/__init__.py index 8bfc56713..1e867520a 100644 --- a/gprofiler/utils/__init__.py +++ b/gprofiler/utils/__init__.py @@ -104,6 +104,9 @@ def start_process(cmd: Union[str, List[str]], via_staticx: bool = False, **kwarg if isinstance(cmd, str): cmd = [cmd] + if is_linux(): + cmd = [resource_path("pdeathsigger")] + cmd if is_linux() else cmd + logger.debug("Running command", command=cmd) env = kwargs.pop("env", None) @@ -123,7 +126,7 @@ def start_process(cmd: Union[str, List[str]], via_staticx: bool = False, **kwarg env.update({"LD_LIBRARY_PATH": ""}) process = Popen( - 
[resource_path("pdeathsigger")] + cmd if is_linux() else cmd, + cmd, stdout=kwargs.pop("stdout", subprocess.PIPE), stderr=kwargs.pop("stderr", subprocess.PIPE), stdin=subprocess.PIPE, @@ -310,10 +313,9 @@ def pgrep_maps(match: str) -> List[Process]: # this is much faster than iterating over processes' maps with psutil. # We use flag -E in grep to support systems where grep is not PCRE result = run_process( - f"grep -lE '{match}' /proc/*/maps", + ["sh", "-c", f"grep -lE '{match}' /proc/*/maps"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, - shell=True, suppress_log=True, check=False, ) From 6588841c3669078111106ee2e3b7f2e723fe6f0c Mon Sep 17 00:00:00 2001 From: slicklash Date: Tue, 3 Sep 2024 16:53:07 +0200 Subject: [PATCH 21/24] build static pdeathsigger --- executable.Dockerfile | 10 ++++------ gprofiler/metadata/system_metadata.py | 7 ++++++- gprofiler/utils/__init__.py | 3 ++- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/executable.Dockerfile b/executable.Dockerfile index 248113737..0f89483d6 100644 --- a/executable.Dockerfile +++ b/executable.Dockerfile @@ -74,11 +74,12 @@ RUN ./phpspy_build.sh # async-profiler glibc FROM centos${AP_BUILDER_CENTOS} AS async-profiler-builder-glibc WORKDIR /tmp -COPY scripts/async_profiler_env_glibc.sh scripts/fix_centos7.sh ./ +COPY scripts/async_profiler_env_glibc.sh scripts/fix_centos7.sh scripts/pdeathsigger.c ./ RUN if grep -q "CentOS Linux 7" /etc/os-release ; then \ ./fix_centos7.sh; \ fi -RUN ./async_profiler_env_glibc.sh +RUN ./async_profiler_env_glibc.sh && \ + gcc -static -o pdeathsigger pdeathsigger.c COPY scripts/async_profiler_build_shared.sh . RUN ./async_profiler_build_shared.sh @@ -212,9 +213,6 @@ RUN ./node_builder_glibc_env.sh COPY scripts/build_node_package.sh . RUN ./build_node_package.sh -COPY scripts/pdeathsigger.c . 
-RUN gcc -o pdeathsigger pdeathsigger.c - # needed for hadolint WORKDIR /app USER 1001 @@ -260,12 +258,12 @@ COPY --from=async-profiler-builder-glibc /usr/bin/xargs gprofiler/resources/php/ COPY --from=async-profiler-builder-glibc /tmp/async-profiler/build/bin/asprof gprofiler/resources/java/asprof COPY --from=async-profiler-builder-glibc /tmp/async-profiler/build/async-profiler-version gprofiler/resources/java/async-profiler-version +COPY --from=async-profiler-builder-glibc /tmp/pdeathsigger gprofiler/resources/pdeathsigger COPY --from=async-profiler-centos-min-test-glibc /libasyncProfiler.so gprofiler/resources/java/glibc/libasyncProfiler.so COPY --from=async-profiler-builder-musl /tmp/async-profiler/build/lib/libasyncProfiler.so gprofiler/resources/java/musl/libasyncProfiler.so COPY --from=node-package-builder-musl /tmp/module_build gprofiler/resources/node/module/musl COPY --from=node-package-builder-glibc /tmp/module_build gprofiler/resources/node/module/glibc -COPY --from=node-package-builder-glibc /tmp/pdeathsigger gprofiler/resources/pdeathsigger COPY --from=burn-builder /tmp/burn/burn gprofiler/resources/burn diff --git a/gprofiler/metadata/system_metadata.py b/gprofiler/metadata/system_metadata.py index 285ea3859..f2d818929 100644 --- a/gprofiler/metadata/system_metadata.py +++ b/gprofiler/metadata/system_metadata.py @@ -48,7 +48,12 @@ def decode_libc_version(version: bytes) -> str: try: ldd_version = run_process( - ["ldd", "--version"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, suppress_log=True, check=False + ["ldd", "--version"], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + suppress_log=True, + check=False, + pdeathsigger=False, ).stdout except FileNotFoundError: ldd_version = b"ldd not found" diff --git a/gprofiler/utils/__init__.py b/gprofiler/utils/__init__.py index 1e867520a..1c5d6af44 100644 --- a/gprofiler/utils/__init__.py +++ b/gprofiler/utils/__init__.py @@ -2,6 +2,7 @@ # Copyright (C) 2022 Intel Corporation # # Licensed 
under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # @@ -104,7 +105,7 @@ def start_process(cmd: Union[str, List[str]], via_staticx: bool = False, **kwarg if isinstance(cmd, str): cmd = [cmd] - if is_linux(): + if kwargs.pop("pdeathsigger", True) and is_linux(): cmd = [resource_path("pdeathsigger")] + cmd if is_linux() else cmd logger.debug("Running command", command=cmd) From 2fb96111da7e606c3d4a47afb4fe2ff4f8cb2141 Mon Sep 17 00:00:00 2001 From: slicklash Date: Wed, 4 Sep 2024 14:10:22 +0200 Subject: [PATCH 22/24] skip pdeathsigger for versions --- executable.Dockerfile | 2 -- gprofiler/metadata/versions.py | 4 +++- gprofiler/profilers/java.py | 1 + gprofiler/profilers/python.py | 1 + gprofiler/utils/fs.py | 2 +- tests/test_java.py | 11 ++++++----- 6 files changed, 12 insertions(+), 9 deletions(-) diff --git a/executable.Dockerfile b/executable.Dockerfile index 0f89483d6..11dc41419 100644 --- a/executable.Dockerfile +++ b/executable.Dockerfile @@ -212,7 +212,6 @@ RUN if grep -q "CentOS Linux 7" /etc/os-release ; then \ RUN ./node_builder_glibc_env.sh COPY scripts/build_node_package.sh . 
RUN ./build_node_package.sh - # needed for hadolint WORKDIR /app USER 1001 @@ -264,7 +263,6 @@ COPY --from=async-profiler-builder-musl /tmp/async-profiler/build/lib/libasyncPr COPY --from=node-package-builder-musl /tmp/module_build gprofiler/resources/node/module/musl COPY --from=node-package-builder-glibc /tmp/module_build gprofiler/resources/node/module/glibc - COPY --from=burn-builder /tmp/burn/burn gprofiler/resources/burn COPY gprofiler gprofiler diff --git a/gprofiler/metadata/versions.py b/gprofiler/metadata/versions.py index a0a3b55bd..5a0e8b700 100644 --- a/gprofiler/metadata/versions.py +++ b/gprofiler/metadata/versions.py @@ -35,7 +35,9 @@ def get_exe_version( exe_path = f"/proc/{get_process_nspid(process.pid)}/exe" def _run_get_version() -> "CompletedProcess[bytes]": - return run_process([exe_path, version_arg], stop_event=stop_event, timeout=get_version_timeout) + return run_process( + [exe_path, version_arg], stop_event=stop_event, timeout=get_version_timeout, pdeathsigger=False + ) try: cp = run_in_ns(["pid", "mnt"], _run_get_version, process.pid) diff --git a/gprofiler/profilers/java.py b/gprofiler/profilers/java.py index c722fc7ce..1002bd9d5 100644 --- a/gprofiler/profilers/java.py +++ b/gprofiler/profilers/java.py @@ -350,6 +350,7 @@ def _run_java_version() -> "CompletedProcess[bytes]": ], stop_event=stop_event, timeout=_JAVA_VERSION_TIMEOUT, + pdeathsigger=False, ) # doesn't work without changing PID NS as well (I'm getting ENOENT for libjli.so) diff --git a/gprofiler/profilers/python.py b/gprofiler/profilers/python.py index 12027cfc6..404e1f3c3 100644 --- a/gprofiler/profilers/python.py +++ b/gprofiler/profilers/python.py @@ -137,6 +137,7 @@ def _run_python_process_in_ns() -> "CompletedProcess[bytes]": [python_path, "-S", "-c", "import sys; print(sys.maxunicode)"], stop_event=self._stop_event, timeout=self._PYTHON_TIMEOUT, + pdeathsigger=False, ) return run_in_ns(["pid", "mnt"], _run_python_process_in_ns, process.pid).stdout.decode().strip() 
diff --git a/gprofiler/utils/fs.py b/gprofiler/utils/fs.py index 974ebab14..40a50e074 100644 --- a/gprofiler/utils/fs.py +++ b/gprofiler/utils/fs.py @@ -57,7 +57,7 @@ def is_rw_exec_dir(path: Path) -> bool: # try executing try: - run_process([str(test_script)], suppress_log=True) + run_process([str(test_script)], suppress_log=True, pdeathsigger=False) except PermissionError: # noexec return False diff --git a/tests/test_java.py b/tests/test_java.py index 125db51dc..9569ea9f5 100644 --- a/tests/test_java.py +++ b/tests/test_java.py @@ -486,11 +486,12 @@ def _filter_record(r: LogRecord) -> bool: # find the log record of # Running command (command=['/app/gprofiler/resources/java/apsprof', '', 'load', # '/path/to/libasyncProfiler.so', 'true', 'start,...']) + command = log_record_extra(r).get("command", []) return ( r.message == "Running command" - and len(log_record_extra(r)["command"]) == 6 - and log_record_extra(r)["command"][2] == "load" - and any(map(lambda k: k in log_record_extra(r)["command"][5], ["start,", "stop,"])) + and len(command) == 7 + and command[3] == "load" + and any(map(lambda k: k in command[6], ["start,", "stop,"])) ) return list(filter(_filter_record, records)) @@ -567,7 +568,7 @@ def test_java_noexec_or_ro_dirs( assert len(jattach_loads) == 2 # 3rd part of commandline to AP - shall begin with POSSIBLE_AP_DIRS[1] assert all( - log_record_extra(jl)["command"][3].startswith(f"{gprofiler.profilers.java.POSSIBLE_AP_DIRS[1]}/async-profiler-") + log_record_extra(jl)["command"][4].startswith(f"{gprofiler.profilers.java.POSSIBLE_AP_DIRS[1]}/async-profiler-") for jl in jattach_loads ) @@ -616,7 +617,7 @@ def test_java_symlinks_in_paths( # 2 entries - start and stop assert len(jattach_loads) == 2 # 3rd part of commandline to AP - shall begin with the final, resolved path. 
- assert all(log_record_extra(jl)["command"][3].startswith("/run/final_tmp/gprofiler_tmp/") for jl in jattach_loads) + assert all(log_record_extra(jl)["command"][4].startswith("/run/final_tmp/gprofiler_tmp/") for jl in jattach_loads) @pytest.mark.parametrize("in_container", [True]) # only in container is enough From 562fa825eec581e40018308fc43453301e00e332 Mon Sep 17 00:00:00 2001 From: slicklash Date: Wed, 9 Oct 2024 00:13:26 +0200 Subject: [PATCH 23/24] up --- gprofiler/profilers/java.py | 2 +- gprofiler/profilers/php.py | 3 +-- gprofiler/profilers/python_ebpf.py | 2 +- gprofiler/utils/__init__.py | 3 ++- granulate-utils | 2 +- tests/test.sh | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/gprofiler/profilers/java.py b/gprofiler/profilers/java.py index 1002bd9d5..fbba33856 100644 --- a/gprofiler/profilers/java.py +++ b/gprofiler/profilers/java.py @@ -1230,7 +1230,6 @@ def _check_async_profiler_loaded(self, process: Process) -> bool: def _profile_process(self, process: Process, duration: int, spawned: bool) -> ProfileData: comm = process_comm(process) exe = process_exe(process) - container_name = self._profiler_state.get_container_name(process.pid) java_version_output: Optional[str] = get_java_version_logged(process, self._profiler_state.stop_event) if self._enabled_proc_events_java: @@ -1260,6 +1259,7 @@ def _profile_process(self, process: Process, duration: int, spawned: bool) -> Pr self._profiled_pids.add(process.pid) logger.info(f"Profiling{' spawned' if spawned else ''} process {process.pid} with async-profiler") + container_name = self._profiler_state.get_container_name(process.pid) app_metadata = self._metadata.get_metadata(process) appid = application_identifiers.get_java_app_id(process, self._collect_spark_app_name) diff --git a/gprofiler/profilers/php.py b/gprofiler/profilers/php.py index 882592f5c..bab63f266 100644 --- a/gprofiler/profilers/php.py +++ b/gprofiler/profilers/php.py @@ -210,11 +210,10 @@ def 
extract_metadata_section(re_expr: Pattern, metadata_line: str) -> str: if profiler_state.processes_to_profile is not None: if pid not in [process.pid for process in profiler_state.processes_to_profile]: continue - container_name = profiler_state.get_container_name(pid) # TODO: appid & app metadata for php! appid = None app_metadata = None - profiles[pid] = ProfileData(results[pid], appid, app_metadata, container_name) + profiles[pid] = ProfileData(results[pid], appid, app_metadata, profiler_state.get_container_name(pid)) return profiles diff --git a/gprofiler/profilers/python_ebpf.py b/gprofiler/profilers/python_ebpf.py index 14f24c11b..50aa236b0 100644 --- a/gprofiler/profilers/python_ebpf.py +++ b/gprofiler/profilers/python_ebpf.py @@ -307,9 +307,9 @@ def snapshot(self) -> ProcessToProfileData: if self._profiler_state.processes_to_profile is not None: if process not in self._profiler_state.processes_to_profile: continue - container_name = self._profiler_state.get_container_name(pid) appid = application_identifiers.get_python_app_id(process) app_metadata = self._metadata.get_metadata(process) + container_name = self._profiler_state.get_container_name(pid) except NoSuchProcess: appid = None app_metadata = None diff --git a/gprofiler/utils/__init__.py b/gprofiler/utils/__init__.py index 9410e1792..b8db9c83f 100644 --- a/gprofiler/utils/__init__.py +++ b/gprofiler/utils/__init__.py @@ -325,9 +325,10 @@ def pgrep_maps(match: str) -> List[Process]: # this is much faster than iterating over processes' maps with psutil. 
# We use flag -E in grep to support systems where grep is not PCRE result = run_process( - ["sh", "-c", f"grep -lE '{match}' /proc/*/maps"], + f"grep -lE '{match}' /proc/*/maps", stdout=subprocess.PIPE, stderr=subprocess.PIPE, + shell=True, suppress_log=True, check=False, pdeathsigger=False, diff --git a/granulate-utils b/granulate-utils index 5866ef2fe..aadebc3a8 160000 --- a/granulate-utils +++ b/granulate-utils @@ -1 +1 @@ -Subproject commit 5866ef2fe75ac7c91c8a05b3df88f6edffd91c9a +Subproject commit aadebc3a8c376ba612f4fb3b78f14d065c35a027 diff --git a/tests/test.sh b/tests/test.sh index 18f547aba..14065ef5a 100755 --- a/tests/test.sh +++ b/tests/test.sh @@ -48,4 +48,4 @@ fi python3 -m pip install -q --upgrade setuptools pip python3 -m pip install -r ./requirements.txt -r ./exe-requirements.txt -r ./dev-requirements.txt # TODO: python3 -m pip install . -sudo -E env "PATH=$PATH" python3 -m pytest -v tests/ "$@" +sudo env "PATH=$PATH" python3 -m pytest -v tests/ "$@" From dbb41c61452cb28b113be8aed477e4ccff1a6522 Mon Sep 17 00:00:00 2001 From: slicklash Date: Tue, 15 Oct 2024 16:28:42 +0200 Subject: [PATCH 24/24] up --- gprofiler/containers_client.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/gprofiler/containers_client.py b/gprofiler/containers_client.py index b407ee352..b8d42a62c 100644 --- a/gprofiler/containers_client.py +++ b/gprofiler/containers_client.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +from threading import Lock from typing import Dict, List, Optional, Set from granulate_utils.containers.client import ContainersClient @@ -26,15 +27,22 @@ logger = get_logger_adapter(__name__) _containers_client: Optional[ContainersClient] = None +_containers_client_lock = Lock() + + +def get_containers_client() -> ContainersClient: + global _containers_client + with _containers_client_lock: + if _containers_client is None: + _containers_client = ContainersClient() + return _containers_client class ContainerNamesClient: def __init__(self) -> None: - global _containers_client try: - if _containers_client is None: - _containers_client = ContainersClient() - logger.info(f"Discovered container runtimes: {_containers_client.get_runtimes()}") + self._containers_client: Optional[ContainersClient] = get_containers_client() + logger.info(f"Discovered container runtimes: {self._containers_client.get_runtimes()}") except NoContainerRuntimesError: logger.warning( "Could not find a Docker daemon or CRI-compatible daemon, profiling data will not" @@ -42,6 +50,7 @@ def __init__(self) -> None: " please open a new issue here:" " https://github.com/Granulate/gprofiler/issues/new" ) + self._containers_client = None self._pid_to_container_name_cache: Dict[int, str] = {} self._current_container_names: Set[str] = set() @@ -56,7 +65,7 @@ def container_names(self) -> List[str]: return list(self._current_container_names) def get_container_name(self, pid: int) -> str: - if _containers_client is None: + if self._containers_client is None: return "" if not valid_perf_pid(pid): @@ -107,5 +116,5 @@ def _get_container_name(self, container_id: str) -> Optional[str]: def _refresh_container_names_cache(self) -> None: # We re-fetch all of the currently running containers, so in order to keep the cache small we clear it self._container_id_to_name_cache.clear() - for container in _containers_client.list_containers() if _containers_client is not None else []: + for container in 
self._containers_client.list_containers() if self._containers_client is not None else []: self._container_id_to_name_cache[container.id] = container.name