Skip to content

Commit

Permalink
java: Increase jattach timeout & make it configurable (#348)
Browse files Browse the repository at this point in the history
In some cases it might be to use a higher timeout, even higher than the new 30s, e.g if process is very
loaded and AP takes a long while to dump the traces.
  • Loading branch information
Jongy authored Apr 26, 2022
1 parent ae33247 commit c60429d
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 5 deletions.
26 changes: 22 additions & 4 deletions gprofiler/profilers/java.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@

from gprofiler import merge
from gprofiler.exceptions import CalledProcessError, NoRwExecDirectoryFoundError
from gprofiler.gprofiler_types import ProcessToProfileData, ProfileData, StackToSampleCount
from gprofiler.gprofiler_types import ProcessToProfileData, ProfileData, StackToSampleCount, positive_integer
from gprofiler.kernel_messages import get_kernel_messages_provider
from gprofiler.log import get_logger_adapter
from gprofiler.metadata import application_identifiers
Expand Down Expand Up @@ -210,7 +210,7 @@ class AsyncProfiledProcess:

# timeouts in seconds
_FDTRANSFER_TIMEOUT = 10
_JATTACH_TIMEOUT = 10 # higher than jattach's timeout
_JATTACH_TIMEOUT = 30 # higher than jattach's timeout

def __init__(
self,
Expand All @@ -221,6 +221,7 @@ def __init__(
mode: str,
ap_safemode: int,
ap_args: str,
jattach_timeout: int = _JATTACH_TIMEOUT,
):
self.process = process
self._stop_event = stop_event
Expand Down Expand Up @@ -266,6 +267,7 @@ def __init__(
self._mode = mode
self._ap_safemode = ap_safemode
self._ap_args = ap_args
self._jattach_timeout = jattach_timeout

def _find_rw_exec_dir(self, available_dirs: Sequence[str]) -> str:
"""
Expand Down Expand Up @@ -396,7 +398,7 @@ def _get_stop_cmd(self, with_output: bool) -> List[str]:
def _run_async_profiler(self, cmd: List[str]) -> None:
try:
# kill jattach with SIGTERM if it hangs. it will go down
run_process(cmd, stop_event=self._stop_event, timeout=self._JATTACH_TIMEOUT, kill_signal=signal.SIGTERM)
run_process(cmd, stop_event=self._stop_event, timeout=self._jattach_timeout, kill_signal=signal.SIGTERM)
except CalledProcessError as e: # catches timeouts as well
if os.path.exists(self._log_path_host):
log = Path(self._log_path_host)
Expand Down Expand Up @@ -580,6 +582,13 @@ def parse_jvm_version(version_string: str) -> JvmVersion:
default=",".join(JAVA_SAFEMODE_DEFAULT_OPTIONS),
help="Sets the Java profiler safemode options. Default is: %(default)s.",
),
ProfilerArgument(
"--java-jattach-timeout",
dest="java_jattach_timeout",
type=positive_integer,
default=AsyncProfiledProcess._JATTACH_TIMEOUT,
help="Timeout for jattach operations (start/stop AP, etc)",
),
],
)
class JavaProfiler(ProcessProfilerBase):
Expand Down Expand Up @@ -613,6 +622,7 @@ def __init__(
java_async_profiler_safemode: int,
java_async_profiler_args: str,
java_safemode: str,
java_jattach_timeout: int,
java_mode: str,
):
assert java_mode == "ap", "Java profiler should not be initialized, wrong java_mode value given"
Expand All @@ -627,6 +637,7 @@ def __init__(
self._init_ap_mode(java_async_profiler_mode)
self._ap_safemode = java_async_profiler_safemode
self._ap_args = java_async_profiler_args
self._jattach_timeout = java_jattach_timeout
self._init_java_safemode(java_safemode)
self._should_profile = True
# if set, profiling is disabled due to this safemode reason.
Expand Down Expand Up @@ -785,7 +796,14 @@ def _profile_process_stackcollapse(self, process: Process) -> StackToSampleCount
logger.info(f"Profiling process {process.pid} with async-profiler")

with AsyncProfiledProcess(
process, self._storage_dir, self._stop_event, self._buildids, self._mode, self._ap_safemode, self._ap_args
process,
self._storage_dir,
self._stop_event,
self._buildids,
self._mode,
self._ap_safemode,
self._ap_args,
self._jattach_timeout,
) as ap_proc:
return self._profile_ap_process(ap_proc, comm)

Expand Down
9 changes: 8 additions & 1 deletion tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,12 @@
from docker.models.images import Image

from gprofiler.gprofiler_types import ProfileData, StackToSampleCount
from gprofiler.profilers.java import JAVA_ASYNC_PROFILER_DEFAULT_SAFEMODE, JAVA_SAFEMODE_ALL, JavaProfiler
from gprofiler.profilers.java import (
JAVA_ASYNC_PROFILER_DEFAULT_SAFEMODE,
JAVA_SAFEMODE_ALL,
AsyncProfiledProcess,
JavaProfiler,
)
from gprofiler.profilers.profiler_base import ProfilerInterface
from gprofiler.utils import remove_path

Expand Down Expand Up @@ -131,6 +136,7 @@ def make_java_profiler(
java_async_profiler_safemode: int = JAVA_ASYNC_PROFILER_DEFAULT_SAFEMODE,
java_async_profiler_args: str = "",
java_safemode: str = JAVA_SAFEMODE_ALL,
java_jattach_timeout: int = AsyncProfiledProcess._JATTACH_TIMEOUT,
java_mode: str = "ap",
) -> JavaProfiler:
assert storage_dir is not None
Expand All @@ -145,6 +151,7 @@ def make_java_profiler(
java_async_profiler_safemode=java_async_profiler_safemode,
java_async_profiler_args=java_async_profiler_args,
java_safemode=java_safemode,
java_jattach_timeout=java_jattach_timeout,
java_mode=java_mode,
)

Expand Down

0 comments on commit c60429d

Please sign in to comment.