Skip to content

Commit

Permalink
add pdeathsigger
Browse files Browse the repository at this point in the history
  • Loading branch information
slicklash committed Oct 8, 2024
1 parent 076e9f2 commit e5c8caf
Show file tree
Hide file tree
Showing 12 changed files with 119 additions and 90 deletions.
6 changes: 4 additions & 2 deletions executable.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,12 @@ RUN ./phpspy_build.sh
# async-profiler glibc
FROM centos${AP_BUILDER_CENTOS} AS async-profiler-builder-glibc
WORKDIR /tmp
COPY scripts/async_profiler_env_glibc.sh scripts/fix_centos7.sh ./
COPY scripts/async_profiler_env_glibc.sh scripts/fix_centos7.sh scripts/pdeathsigger.c ./
RUN if grep -q "CentOS Linux 7" /etc/os-release ; then \
./fix_centos7.sh; \
fi
RUN ./async_profiler_env_glibc.sh
RUN ./async_profiler_env_glibc.sh && \
gcc -static -o pdeathsigger pdeathsigger.c

COPY scripts/async_profiler_build_shared.sh .
RUN ./async_profiler_build_shared.sh
Expand Down Expand Up @@ -256,6 +257,7 @@ COPY --from=async-profiler-builder-glibc /usr/bin/xargs gprofiler/resources/php/

COPY --from=async-profiler-builder-glibc /tmp/async-profiler/build/bin/asprof gprofiler/resources/java/asprof
COPY --from=async-profiler-builder-glibc /tmp/async-profiler/build/async-profiler-version gprofiler/resources/java/async-profiler-version
COPY --from=async-profiler-builder-glibc /tmp/pdeathsigger gprofiler/resources/pdeathsigger
COPY --from=async-profiler-centos-min-test-glibc /libasyncProfiler.so gprofiler/resources/java/glibc/libasyncProfiler.so
COPY --from=async-profiler-builder-musl /tmp/async-profiler/build/lib/libasyncProfiler.so gprofiler/resources/java/musl/libasyncProfiler.so
COPY --from=node-package-builder-musl /tmp/module_build gprofiler/resources/node/module/musl
Expand Down
30 changes: 2 additions & 28 deletions gprofiler/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,12 @@
import logging.handlers
import os
import shutil
import signal
import sys
import time
import traceback
from pathlib import Path
from threading import Event
from types import FrameType, TracebackType
from types import TracebackType
from typing import Iterable, List, Optional, Type, cast

import configargparse
Expand Down Expand Up @@ -74,6 +73,7 @@
reset_umask,
resource_path,
run_process,
setup_signals,
)
from gprofiler.utils.fs import escape_filename, mkdir_owned_root
from gprofiler.utils.proxy import get_https_proxy
Expand All @@ -98,21 +98,6 @@

UPLOAD_FILE_SUBCOMMAND = "upload-file"

# 1 KeyboardInterrupt raised per this many seconds, no matter how many SIGINTs we get.
SIGINT_RATELIMIT = 0.5

last_signal_ts: Optional[float] = None


def sigint_handler(sig: int, frame: Optional[FrameType]) -> None:
global last_signal_ts
ts = time.monotonic()
# no need for atomicity here: we can't get another SIGINT before this one returns.
# https://www.gnu.org/software/libc/manual/html_node/Signals-in-Handler.html#Signals-in-Handler
if last_signal_ts is None or ts > last_signal_ts + SIGINT_RATELIMIT:
last_signal_ts = ts
raise KeyboardInterrupt


class GProfiler:
def __init__(
Expand Down Expand Up @@ -939,17 +924,6 @@ def verify_preconditions(args: configargparse.Namespace, processes_to_profile: O
sys.exit(1)


def setup_signals() -> None:
# When we run under staticx & PyInstaller, both of them forward (some of the) signals to gProfiler.
# We catch SIGINTs and ratelimit them, to avoid being interrupted again during the handling of the
# first INT.
# See my commit message for more information.
signal.signal(signal.SIGINT, sigint_handler)
# handle SIGTERM in the same manner - gracefully stop gProfiler.
# SIGTERM is also forwarded by staticx & PyInstaller, so we need to ratelimit it.
signal.signal(signal.SIGTERM, sigint_handler)


def log_system_info() -> None:
system_info = get_static_system_info()
logger.info(f"gProfiler Python version: {system_info.python_version}")
Expand Down
7 changes: 6 additions & 1 deletion gprofiler/metadata/system_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,12 @@ def decode_libc_version(version: bytes) -> str:

try:
ldd_version = run_process(
["ldd", "--version"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, suppress_log=True, check=False
["ldd", "--version"],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
suppress_log=True,
check=False,
pdeathsigger=False,
).stdout
except FileNotFoundError:
ldd_version = b"ldd not found"
Expand Down
4 changes: 3 additions & 1 deletion gprofiler/metadata/versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ def get_exe_version(
exe_path = f"/proc/{get_process_nspid(process.pid)}/exe"

def _run_get_version() -> "CompletedProcess[bytes]":
return run_process([exe_path, version_arg], stop_event=stop_event, timeout=get_version_timeout)
return run_process(
[exe_path, version_arg], stop_event=stop_event, timeout=get_version_timeout, pdeathsigger=False
)

try:
cp = run_in_ns(["pid", "mnt"], _run_get_version, process.pid)
Expand Down
1 change: 1 addition & 0 deletions gprofiler/profilers/java.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,7 @@ def _run_java_version() -> "CompletedProcess[bytes]":
],
stop_event=stop_event,
timeout=_JAVA_VERSION_TIMEOUT,
pdeathsigger=False,
)

# doesn't work without changing PID NS as well (I'm getting ENOENT for libjli.so)
Expand Down
1 change: 1 addition & 0 deletions gprofiler/profilers/python.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ def _run_python_process_in_ns() -> "CompletedProcess[bytes]":
[python_path, "-S", "-c", "import sys; print(sys.maxunicode)"],
stop_event=self._stop_event,
timeout=self._PYTHON_TIMEOUT,
pdeathsigger=False,
)

return run_in_ns(["pid", "mnt"], _run_python_process_in_ns, process.pid).stdout.decode().strip()
Expand Down
92 changes: 45 additions & 47 deletions gprofiler/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import atexit
import ctypes
import datetime
import glob
Expand All @@ -33,6 +34,7 @@
from subprocess import CompletedProcess, Popen, TimeoutExpired
from tempfile import TemporaryDirectory
from threading import Event
from types import FrameType
from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union, cast

import importlib_resources
Expand Down Expand Up @@ -70,6 +72,12 @@

gprofiler_mutex: Optional[socket.socket] = None

# 1 KeyboardInterrupt raised per this many seconds, no matter how many SIGINTs we get.
SIGINT_RATELIMIT = 0.5

_last_signal_ts: Optional[float] = None
_processes: List[Popen] = []


@lru_cache(maxsize=None)
def resource_path(relative_path: str = "") -> str:
Expand All @@ -90,52 +98,20 @@ def is_root() -> bool:
return os.geteuid() == 0


libc: Optional[ctypes.CDLL] = None


def prctl(*argv: Any) -> int:
global libc
if libc is None:
libc = ctypes.CDLL("libc.so.6", use_errno=True)
return cast(int, libc.prctl(*argv))


PR_SET_PDEATHSIG = 1


def set_child_termination_on_parent_death() -> int:
ret = prctl(PR_SET_PDEATHSIG, signal.SIGTERM)
if ret != 0:
errno = ctypes.get_errno()
logger.warning(
f"Failed to set parent-death signal on child process. errno: {errno}, strerror: {os.strerror(errno)}"
)
return ret


def wrap_callbacks(callbacks: List[Callable]) -> Callable:
# Expects array of callback.
# Returns one callback that call each one of them, and returns the retval of last callback
def wrapper() -> Any:
ret = None
for cb in callbacks:
ret = cb()

return ret

return wrapper


def start_process(
cmd: Union[str, List[str]],
via_staticx: bool = False,
term_on_parent_death: bool = True,
tmpdir: Optional[Path] = None,
**kwargs: Any,
) -> Popen:
global _processes

if isinstance(cmd, str):
cmd = [cmd]

if kwargs.pop("pdeathsigger", True) and is_linux():
cmd = [resource_path("pdeathsigger")] + cmd if is_linux() else cmd

logger.debug("Running command", command=cmd)

env = kwargs.pop("env", None)
Expand All @@ -161,23 +137,17 @@ def start_process(
# explicitly remove our directory from LD_LIBRARY_PATH
env["LD_LIBRARY_PATH"] = ""

if is_windows():
cur_preexec_fn = None # preexec_fn is not supported on Windows platforms. subprocess.py reports this.
else:
cur_preexec_fn = kwargs.pop("preexec_fn", os.setpgrp)
if term_on_parent_death:
cur_preexec_fn = wrap_callbacks([set_child_termination_on_parent_death, cur_preexec_fn])

popen = Popen(
process = Popen(
cmd,
stdout=kwargs.pop("stdout", subprocess.PIPE),
stderr=kwargs.pop("stderr", subprocess.PIPE),
stdin=subprocess.PIPE,
preexec_fn=cur_preexec_fn,
start_new_session=is_linux(), # TODO: change to "process_group" after upgrade to Python 3.11+
env=env,
**kwargs,
)
return popen
_processes.append(process)
return process


def wait_event(timeout: float, stop_event: Event, condition: Callable[[], bool], interval: float = 0.1) -> None:
Expand Down Expand Up @@ -361,6 +331,7 @@ def pgrep_maps(match: str) -> List[Process]:
shell=True,
suppress_log=True,
check=False,
pdeathsigger=False,
)
# 0 - found
# 1 - not found
Expand Down Expand Up @@ -563,3 +534,30 @@ def merge_dicts(source: Dict[str, Any], dest: Dict[str, Any]) -> Dict[str, Any]:

def is_profiler_disabled(profile_mode: str) -> bool:
return profile_mode in ("none", "disabled")


def _exit_handler() -> None:
for process in _processes:
process.kill()


def _sigint_handler(sig: int, frame: Optional[FrameType]) -> None:
global _last_signal_ts
ts = time.monotonic()
# no need for atomicity here: we can't get another SIGINT before this one returns.
# https://www.gnu.org/software/libc/manual/html_node/Signals-in-Handler.html#Signals-in-Handler
if _last_signal_ts is None or ts > _last_signal_ts + SIGINT_RATELIMIT:
_last_signal_ts = ts
raise KeyboardInterrupt


def setup_signals() -> None:
atexit.register(_exit_handler)
# When we run under staticx & PyInstaller, both of them forward (some of the) signals to gProfiler.
# We catch SIGINTs and ratelimit them, to avoid being interrupted again during the handling of the
# first INT.
# See my commit message for more information.
signal.signal(signal.SIGINT, _sigint_handler)
# handle SIGTERM in the same manner - gracefully stop gProfiler.
# SIGTERM is also forwarded by staticx & PyInstaller, so we need to ratelimit it.
signal.signal(signal.SIGTERM, _sigint_handler)
2 changes: 1 addition & 1 deletion gprofiler/utils/fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def is_rw_exec_dir(path: Path) -> bool:

# try executing
try:
run_process([str(test_script)], suppress_log=True)
run_process([str(test_script)], suppress_log=True, pdeathsigger=False)
except PermissionError:
# noexec
return False
Expand Down
28 changes: 28 additions & 0 deletions scripts/pdeathsigger.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <signal.h>

/*
preexec_fn is not safe to use in the presence of threads,
child process could deadlock before exec is called.
this little shim is a workaround to avoid using preexe_fn and
still get the desired behavior (PR_SET_PDEATHSIG).
*/
int main(int argc, char *argv[]) {
if (argc < 2) {
fprintf(stderr, "Usage: %s /path/to/binary [args...]\n", argv[0]);
return 1;
}

if (prctl(PR_SET_PDEATHSIG, SIGKILL) == -1) {
perror("prctl");
return 1;
}

execvp(argv[1], &argv[1]);

perror("execvp");
return 1;
}
22 changes: 13 additions & 9 deletions tests/test_java.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ def test_java_safemode_version_check(
monkeypatch.setitem(JavaProfiler.MINIMAL_SUPPORTED_VERSIONS, 8, (Version("8.999"), 0))

with make_java_profiler(profiler_state) as profiler:
profiler._profiler_state.get_container_name(application_pid)
process = profiler._select_processes_to_profile()[0]
jvm_version_str = cast_away_optional(get_java_version(process, profiler._profiler_state.stop_event))
jvm_version = parse_jvm_version(jvm_version_str)
Expand All @@ -245,6 +246,7 @@ def test_java_safemode_build_number_check(
profiler_state: ProfilerState,
) -> None:
with make_java_profiler(profiler_state) as profiler:
profiler._profiler_state.get_container_name(application_pid)
process = profiler._select_processes_to_profile()[0]
jvm_version_str = cast_away_optional(get_java_version(process, profiler._profiler_state.stop_event))
jvm_version = parse_jvm_version(jvm_version_str)
Expand Down Expand Up @@ -417,6 +419,7 @@ def test_sanity_other_jvms(
frequency=99,
java_async_profiler_mode="cpu",
) as profiler:
profiler._profiler_state.get_container_name(application_pid)
process = psutil.Process(application_pid)
assert search_for in cast_away_optional(get_java_version(process, profiler._profiler_state.stop_event))
process_collapsed = snapshot_pid_collapsed(profiler, application_pid)
Expand All @@ -438,6 +441,7 @@ def test_sanity_latest_jvms(
"""

with make_java_profiler(profiler_state) as profiler:
profiler._profiler_state.get_container_name(application_pid)
# sanity check that this is the correct JVM we're targeting
assert search_for in cast_away_optional(
get_java_version(psutil.Process(application_pid), profiler._profiler_state.stop_event)
Expand Down Expand Up @@ -482,11 +486,12 @@ def _filter_record(r: LogRecord) -> bool:
# find the log record of
# Running command (command=['/app/gprofiler/resources/java/apsprof', '<PID>', 'load',
# '/path/to/libasyncProfiler.so', 'true', 'start,...'])
command = log_record_extra(r).get("command", [])
return (
r.message == "Running command"
and len(log_record_extra(r)["command"]) == 6
and log_record_extra(r)["command"][2] == "load"
and any(map(lambda k: k in log_record_extra(r)["command"][5], ["start,", "stop,"]))
and len(command) == 7
and command[3] == "load"
and any(map(lambda k: k in command[6], ["start,", "stop,"]))
)

return list(filter(_filter_record, records))
Expand Down Expand Up @@ -563,7 +568,7 @@ def test_java_noexec_or_ro_dirs(
assert len(jattach_loads) == 2
# 3rd part of commandline to AP - shall begin with POSSIBLE_AP_DIRS[1]
assert all(
log_record_extra(jl)["command"][3].startswith(f"{gprofiler.profilers.java.POSSIBLE_AP_DIRS[1]}/async-profiler-")
log_record_extra(jl)["command"][4].startswith(f"{gprofiler.profilers.java.POSSIBLE_AP_DIRS[1]}/async-profiler-")
for jl in jattach_loads
)

Expand Down Expand Up @@ -612,7 +617,7 @@ def test_java_symlinks_in_paths(
# 2 entries - start and stop
assert len(jattach_loads) == 2
# 3rd part of commandline to AP - shall begin with the final, resolved path.
assert all(log_record_extra(jl)["command"][3].startswith("/run/final_tmp/gprofiler_tmp/") for jl in jattach_loads)
assert all(log_record_extra(jl)["command"][4].startswith("/run/final_tmp/gprofiler_tmp/") for jl in jattach_loads)


@pytest.mark.parametrize("in_container", [True]) # only in container is enough
Expand Down Expand Up @@ -1246,10 +1251,9 @@ def test_collect_flags_unsupported_filtered_out(
f"exec java {java_cli_flags} -jar Fibonacci.jar",
],
) as container:
assert (
profiler._metadata.get_jvm_flags_serialized(psutil.Process(container.attrs["State"]["Pid"]))
== expected_flags
)
pid = container.attrs["State"]["Pid"]
profiler._profiler_state.get_container_name(pid)
assert profiler._metadata.get_jvm_flags_serialized(psutil.Process(pid)) == expected_flags
log_record = next(filter(lambda r: r.message == "Missing requested flags:", caplog.records))
# use slicing to remove the leading -XX: instead of removeprefix as it's not available in python 3.8
assert (
Expand Down
Loading

0 comments on commit e5c8caf

Please sign in to comment.