Skip to content

Commit

Permalink
traverser: add support for canceling brokerless resource vertices
Browse files Browse the repository at this point in the history
Problems: handling partial cancel for brokerless resources when default
pruning filters are set (ALL:core) and pruning filters are set for the
resources excluded from broker ranks (e.g., ALL:ssd). With ALL:core
configured, the final .free RPC frees all planner_multi-tracked
resources, which prevents a cleanup, full cancel. However, tracking
additional resources (e.g., ALL:ssd) successfully removes resource
allocations on those vertices only with a cleanup cancel.

Add capability to detect brokerless resource vertices and cancel the
vertices in a cleanup step.
  • Loading branch information
milroy committed Oct 18, 2024
1 parent 132a214 commit e155693
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 0 deletions.
1 change: 1 addition & 0 deletions resource/readers/resource_reader_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ struct modify_data_t {
job_modify_t mod_type = job_modify_t::PARTIAL_CANCEL;
std::unordered_set<int64_t> ranks_removed;
std::unordered_map<const char *, int64_t> type_to_count;
std::unordered_set<vtx_t> brokerless_res;
};

/*! Base resource reader class.
Expand Down
1 change: 1 addition & 0 deletions resource/schema/data_std.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ resource_type_t gpu_rt{"gpu"};
resource_type_t node_rt{"node"};
resource_type_t rack_rt{"rack"};
resource_type_t slot_rt{"slot"};
resource_type_t ssd_rt{"ssd"};

} // namespace resource_model
} // namespace Flux
Expand Down
1 change: 1 addition & 0 deletions resource/schema/data_std.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ extern resource_type_t node_rt;
extern resource_type_t socket_rt;
extern resource_type_t gpu_rt;
extern resource_type_t core_rt;
extern resource_type_t ssd_rt;

template<class T, int likely_count = 2>
using subsystem_key_vec = intern::interned_key_vec<subsystem_t, T, likely_count>;
Expand Down
18 changes: 18 additions & 0 deletions resource/traversers/dfu_impl_update.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,12 @@ int dfu_impl_t::mod_plan (vtx_t u, int64_t jobid, modify_data_t &mod_data)
span = alloc_span->second;
if (mod_data.mod_type != job_modify_t::PARTIAL_CANCEL) {
(*m_graph)[u].schedule.allocations.erase (alloc_span);
} else {
// Add to brokerless resource set to execute in follow-up
// vertex cancel
if ((*m_graph)[u].type == ssd_rt)
mod_data.brokerless_res.insert (u);

Check warning on line 546 in resource/traversers/dfu_impl_update.cpp

View check run for this annotation

Codecov / codecov/patch

resource/traversers/dfu_impl_update.cpp#L546

Added line #L546 was not covered by tests
goto done;
}
} else if ((res_span = (*m_graph)[u].schedule.reservations.find (jobid))
!= (*m_graph)[u].schedule.reservations.end ()) {
Expand Down Expand Up @@ -836,6 +842,18 @@ int dfu_impl_t::remove (vtx_t root,
m_color.reset ();
if (root_has_jtag) {
rc = mod_dfv (root, jobid, mod_data);

// Need to re-run VTX_CANCEL in case brokerless resources (e.g., SSDs)
// were found in the previous VTX_CANCEL
if (mod_data.brokerless_res.size () > 0) {
mod_data.mod_type = job_modify_t::VTX_CANCEL;
for (const vtx_t &vtx : mod_data.brokerless_res) {
if ((rc = cancel_vertex (vtx, mod_data, jobid)) != 0) {
errno = EINVAL;
return rc;

Check warning on line 853 in resource/traversers/dfu_impl_update.cpp

View check run for this annotation

Codecov / codecov/patch

resource/traversers/dfu_impl_update.cpp#L849-L853

Added lines #L849 - L853 were not covered by tests
}
}
}
// Was the root vertex's job tag removed? If so, full_cancel
full_cancel =
((*m_graph)[root].idata.tags.find (jobid) == (*m_graph)[root].idata.tags.end ());
Expand Down

0 comments on commit e155693

Please sign in to comment.