Skip to content

Commit

Permalink
qmanager: full cancel after pcancel not error condition
Browse files: browse the repository at this point in the history
  • Loading branch information
milroy committed Oct 17, 2024
1 parent 2607e1a commit 35907cc
Showing 1 changed file with 12 additions and 9 deletions.
21 changes: 12 additions & 9 deletions qmanager/policies/base/queue_policy_base.hpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -667,23 +667,26 @@ class queue_policy_base_t : public resource_model::queue_adapter_base_t {
auto job_sp = job_it->second;
m_jobs.erase (job_it);
if (final && !full_removal) {
// This error condition indicates a discrepancy between core and sched.
flux_log_error (flux_h,
"%s: Final .free RPC failed to remove all resources for "
"jobid "
"%jd",
__FUNCTION__,
static_cast<intmax_t> (id));
// This condition can indicate a discrepancy between core and sched, but it
// more commonly indicates that a partial cancel didn't clean up resources
// external to a broker rank (e.g., SSDs).
flux_log (flux_h,
LOG_DEBUG,
"%s: Final .free RPC failed to remove all resources for "
"jobid "
"%jd",
__FUNCTION__,
static_cast<intmax_t> (id));
// Run a full cancel to clean up all remaining allocated resources
if (cancel (h, job_sp->id, true) != 0) {
flux_log_error (flux_h,
"%s: .free RPC full cancel failed for jobid "
"%jd",
__FUNCTION__,
static_cast<intmax_t> (id));
errno = EPROTO;
goto out;
}
errno = EPROTO;
goto out;
}
}
break;
Expand Down

0 comments on commit 35907cc

Please sign in to comment.