Skip to content

Commit

Permalink
workqueue: Add helper and corelens module to show unsubmitted pending…
Browse files Browse the repository at this point in the history
… works.

delayed_work(s) get their pending bit set immediately, but are actually submitted
to a workqueue only upon expiration of the corresponding timer(s).
Recently we have found some cases where a delayed work submitted to an already
offlined CPU was never getting executed, because the underlying timers were not
firing in the first place. Since the pending bit was set, this gave the impression
that the work item was lost by the workqueue subsystem (which was not the case here).

Add a helper and a corelens module to dump delayed_work(s) whose timer has
not yet expired. This is of interest mainly for offline CPUs, because ideally
we should not see any delayed_work timer lying on an offlined CPU. So by default
the helper and corelens module dump this info for offlined CPUs only, as shown
in the snippet below:

python3 -m drgn_tools.corelens vmcore -d ~/v5.4/ -M offlined_delayed_works
CPU: 4 state: offline
timer: ffff8ce6bd7b3a40 tte(jiffies): 289126 work: ffff8ce6bd7b3a20 func: UNKNOWN: 0xffffffffc0327000
timer: ffff8ce6bd7b39e0 tte(jiffies): 289125 work: ffff8ce6bd7b39c0 func: UNKNOWN: 0xffffffffc0327000
timer: ffff8ce6bd7b3980 tte(jiffies): 289125 work: ffff8ce6bd7b3960 func: UNKNOWN: 0xffffffffc0327000
timer: ffff8ce6bd7b3920 tte(jiffies): 289125 work: ffff8ce6bd7b3900 func: UNKNOWN: 0xffffffffc0327000
timer: ffff8ce6bd7b38c0 tte(jiffies): 289124 work: ffff8ce6bd7b38a0 func: UNKNOWN: 0xffffffffc0327000
timer: ffff8ce6bd7b3860 tte(jiffies): 289124 work: ffff8ce6bd7b3840 func: UNKNOWN: 0xffffffffc0327000
timer: ffff8ce6bd7b3800 tte(jiffies): 289124 work: ffff8ce6bd7b37e0 func: UNKNOWN: 0xffffffffc0327000
timer: ffff8ce6bd7b37a0 tte(jiffies): 289124 work: ffff8ce6bd7b3780 func: UNKNOWN: 0xffffffffc0327000
timer: ffff8ce6bd7b3740 tte(jiffies): 289124 work: ffff8ce6bd7b3720 func: UNKNOWN: 0xffffffffc0327000
timer: ffff8ce6bd7b36e0 tte(jiffies): 289124 work: ffff8ce6bd7b36c0 func: UNKNOWN: 0xffffffffc0327000

Signed-off-by: Imran Khan <[email protected]>
  • Loading branch information
imran-kn committed Jan 15, 2025
1 parent 0a08b41 commit ab8d798
Showing 1 changed file with 86 additions and 0 deletions.
86 changes: 86 additions & 0 deletions drgn_tools/workqueue.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,17 @@
from typing import Union

from drgn import cast
from drgn import container_of
from drgn import FaultError
from drgn import IntegerLike
from drgn import NULL
from drgn import Object
from drgn import Program
from drgn import sizeof
from drgn.helpers.common.format import escape_ascii_string
from drgn.helpers.linux.bitops import for_each_set_bit
from drgn.helpers.linux.cpumask import for_each_online_cpu
from drgn.helpers.linux.cpumask import for_each_possible_cpu
from drgn.helpers.linux.idr import idr_find
from drgn.helpers.linux.idr import idr_for_each
from drgn.helpers.linux.list import hlist_for_each_entry
Expand Down Expand Up @@ -52,6 +58,7 @@
"is_task_a_worker",
"find_worker_executing_work",
"workqueue_get_pwq",
"show_unexpired_delayed_works",
)


Expand Down Expand Up @@ -614,6 +621,85 @@ def find_worker_executing_work(work: Object) -> Object:
return NULL(prog, "struct worker *")


def show_unexpired_delayed_works(
    prog: Program, only_offline_cpus: bool = True
) -> None:
    """
    Show delayed_work(s) whose timers have not yet expired.

    delayed_work(s) get their `WORK_STRUCT_PENDING_BIT` set, but get
    submitted only at expiration of corresponding timer.
    This helper dumps all delayed_work(s) that have not yet made it to
    any worker_pool, due to their timers not firing for one reason or
    another.

    For each selected CPU a ``CPU: <n> state: <online|offline>`` header is
    printed, followed by one line per pending timer whose callback is
    ``delayed_work_timer_fn``. Each line shows the timer address, the
    time-to-expiry in jiffies (``timer.expires - jiffies``; a negative value
    means the expiry time is already in the past), the address of the
    embedded ``work_struct`` and the work function (symbol name, or
    ``UNKNOWN: 0x<addr>`` when the address cannot be symbolized).

    If a ``FaultError`` is raised while walking a CPU's timers, the rest of
    that CPU is skipped silently and the walk continues with the next CPU.

    :param prog: drgn program
    :param only_offline_cpus: if True only delayed_works on offlined CPUs are shown.
    """
    # A set gives O(1) membership tests; the check runs once per possible CPU.
    online_cpus = set(for_each_online_cpu(prog))
    # Loop-invariant: address of the timer callback used by all delayed works.
    dwork_timer_fn = prog["delayed_work_timer_fn"].address_of_()

    for cpu in for_each_possible_cpu(prog):
        is_online = cpu in online_cpus
        if only_offline_cpus and is_online:
            continue
        cpu_state = "online" if is_online else "offline"
        print(f"CPU: {cpu} state: {cpu_state}")
        try:
            # Read jiffies once per CPU rather than once per timer. It is
            # kept inside the try block so that a fault while reading it is
            # handled the same way as any other fault for this CPU.
            jiffies = prog["jiffies"].value_()
            for timer_base in per_cpu(prog["timer_bases"], cpu):
                # Only buckets whose bit is set in pending_map hold timers.
                nr_bits = sizeof(timer_base.pending_map) * 8
                for idx in for_each_set_bit(timer_base.pending_map, nr_bits):
                    for timer in hlist_for_each_entry(
                        "struct timer_list",
                        timer_base.vectors[idx].address_of_(),
                        "entry",
                    ):
                        # Skip timers that do not belong to a delayed_work.
                        if dwork_timer_fn != timer.function:
                            continue
                        dwork = container_of(
                            timer,
                            "struct delayed_work",
                            "timer",
                        )
                        tte = timer.expires.value_() - jiffies
                        work = dwork.work.address_
                        func_addr = dwork.work.func.value_()
                        try:
                            func = prog.symbol(func_addr).name
                        except LookupError:
                            # e.g. no symbol information covers this address
                            func = f"UNKNOWN: 0x{func_addr:x}"
                        print(
                            f"timer: {timer.value_():x} tte(jiffies): {tte} work: {work:x} func: {func}"
                        )
        except FaultError:
            # Best effort: give up on this CPU and move on to the next one.
            continue


class OfflinedDelayedWorksModule(CorelensModule):
    """
    Show delayed works from offlined CPUs.

    A delayed work with a non-zero delay is handed to its workqueue by a
    timer-wheel timer. If that timer never fires, the work is never
    submitted. Timer-wheel timers on offlined CPUs do not fire, so a
    delayed work queued on an offlined CPU is not executed even after its
    delay has elapsed.

    This corelens module lists delayed works on offlined CPUs, so that one
    can tell whether a delayed work was left unexecuted because it was
    submitted on an offlined CPU.
    """

    name = "offlined_delayed_works"

    def run(self, prog: Program, args: argparse.Namespace) -> None:
        # Restrict the report to CPUs that are not online.
        show_unexpired_delayed_works(prog, only_offline_cpus=True)


class WorkqueueModule(CorelensModule):
"""Show details about all workqueues"""

Expand Down

0 comments on commit ab8d798

Please sign in to comment.