diff --git a/resource/modules/resource_match.cpp b/resource/modules/resource_match.cpp index f8554555a..6fe13263e 100644 --- a/resource/modules/resource_match.cpp +++ b/resource/modules/resource_match.cpp @@ -1908,6 +1908,11 @@ static int run_remove (std::shared_ptr &ctx, { int rc = -1; dfu_traverser_t &tr = *(ctx->traverser); + std::shared_ptr info = nullptr; + bool pcanceled = false; + + if (is_existent_jobid (ctx, jobid)) + info = ctx->jobs[jobid]; if (part_cancel) { // RV1exec only reader supported in production currently @@ -1921,13 +1926,18 @@ static int run_remove (std::shared_ptr &ctx, static_cast (jobid)); goto out; } + if (info) + info->set_pcanceled (); rc = tr.remove (R, reader, jobid, full_removal); } else { - rc = tr.remove (jobid); + if (info) + pcanceled = info->get_pcancel_state (); + + rc = tr.remove (jobid, pcanceled); full_removal = true; } if (rc != 0) { - if (is_existent_jobid (ctx, jobid)) { + if (info) { // When this condition arises, we will be less likely // to be able to reuse this jobid. Having the errored job // in the jobs map will prevent us from reusing the jobid @@ -1935,7 +1945,6 @@ static int run_remove (std::shared_ptr &ctx, // removed multiple times by the upper queuing layer // as part of providing advanced queueing policies // (e.g., conservative backfill). - std::shared_ptr info = ctx->jobs[jobid]; info->state = job_lifecycle_t::ERROR; } flux_log (ctx->h, diff --git a/resource/planner/c/planner_multi.h b/resource/planner/c/planner_multi.h index e195ed8a9..2d5ed677e 100644 --- a/resource/planner/c/planner_multi.h +++ b/resource/planner/c/planner_multi.h @@ -278,13 +278,15 @@ int64_t planner_multi_add_span (planner_multi_t *ctx, * \param ctx opaque multi-planner context returned * from planner_multi_new. * \param span_id span_id returned from planner_multi_add_span. + * \param post_pcancel bool indicating whether preceded by one or more partial + * cancels * \return 0 on success; -1 on error with errno set as follows: * EINVAL: invalid argument. * EKEYREJECTED: span could not be removed from * the planner's internal data structures. * ERANGE: a resource state became out of a valid range. */ -int planner_multi_rem_span (planner_multi_t *ctx, int64_t span_id); +int planner_multi_rem_span (planner_multi_t *ctx, int64_t span_id, bool post_pcancel); /*! Reduce the existing span's resources from the planner. * This function will be called for a partial release/cancel. diff --git a/resource/planner/c/planner_multi_c_interface.cpp b/resource/planner/c/planner_multi_c_interface.cpp index 97c6f8104..cff641eeb 100644 --- a/resource/planner/c/planner_multi_c_interface.cpp +++ b/resource/planner/c/planner_multi_c_interface.cpp @@ -408,7 +408,7 @@ extern "C" int64_t planner_multi_add_span (planner_multi_t *ctx, return mspan; } -extern "C" int planner_multi_rem_span (planner_multi_t *ctx, int64_t span_id) +extern "C" int planner_multi_rem_span (planner_multi_t *ctx, int64_t span_id, bool post_pcancel) { size_t i; int rc = -1; @@ -423,8 +423,12 @@ extern "C" int planner_multi_rem_span (planner_multi_t *ctx, int64_t span_id) goto done; } for (i = 0; i < it->second.size (); ++i) { - if (planner_rem_span (ctx->plan_multi->get_planner_at (i), it->second[i]) == -1) - goto done; + if (planner_rem_span (ctx->plan_multi->get_planner_at (i), it->second[i]) == -1) { + // If executed after partial cancel, depending on pruning filter settings + // some spans may no longer exist. In that case there is no error. + if (!post_pcancel) + goto done; + } } ctx->plan_multi->get_span_lookup ().erase (it); rc = 0; diff --git a/resource/planner/test/planner_test02.cpp b/resource/planner/test/planner_test02.cpp index a9d5481d0..c2170744f 100644 --- a/resource/planner/test/planner_test02.cpp +++ b/resource/planner/test/planner_test02.cpp @@ -528,7 +528,7 @@ static int test_multi_add_remove () span3 = planner_multi_add_span (ctx, 2000, 1000, request3, len); ok ((span3 != -1), "span added for (%s)", ss.str ().c_str ()); - rc = planner_multi_rem_span (ctx, span2); + rc = planner_multi_rem_span (ctx, span2, false); ok (!rc, "multi_rem_span works"); size = planner_multi_span_size (ctx); @@ -582,7 +582,7 @@ static int test_constructors_and_overload () bo = (bo || !(planner_multis_equal (ctx, ctx2))); ok (!bo, "test copy constructor doesn't mutate planner"); // Compare planners after mutation - rc = planner_multi_rem_span (ctx2, span); + rc = planner_multi_rem_span (ctx2, span, false); size = planner_multi_span_size (ctx2); ok ((size == 2), "planner_multi_span_size works after copy"); diff --git a/resource/readers/resource_reader_base.hpp b/resource/readers/resource_reader_base.hpp index 2b7252834..9c702c3fd 100644 --- a/resource/readers/resource_reader_base.hpp +++ b/resource/readers/resource_reader_base.hpp @@ -22,12 +22,13 @@ namespace Flux { namespace resource_model { -enum class job_modify_t { CANCEL, PARTIAL_CANCEL, VTX_CANCEL }; +enum class job_modify_t { CANCEL, POST_PCANCEL, PARTIAL_CANCEL, VTX_CANCEL }; struct modify_data_t { job_modify_t mod_type = job_modify_t::PARTIAL_CANCEL; std::unordered_set ranks_removed; std::unordered_map type_to_count; + std::unordered_set brokerless_res; }; /*! Base resource reader class. diff --git a/resource/readers/resource_reader_jgf.cpp b/resource/readers/resource_reader_jgf.cpp index 6f8f6f3c7..ced75a86a 100644 --- a/resource/readers/resource_reader_jgf.cpp +++ b/resource/readers/resource_reader_jgf.cpp @@ -842,7 +842,7 @@ int resource_reader_jgf_t::cancel_vtx (vtx_t vtx, if (agg_span != job2span.end ()) { if ((subtree_plan = g[vtx].idata.subplans[containment_sub]) == NULL) goto ret; - if (planner_multi_rem_span (subtree_plan, agg_span->second) != 0) + if (planner_multi_rem_span (subtree_plan, agg_span->second, false) != 0) goto ret; // Delete from job2span tracker job2span.erase (update_data.jobid); diff --git a/resource/reapi/bindings/c++/reapi_cli_impl.hpp b/resource/reapi/bindings/c++/reapi_cli_impl.hpp index 1d81410fe..6669b29a7 100644 --- a/resource/reapi/bindings/c++/reapi_cli_impl.hpp +++ b/resource/reapi/bindings/c++/reapi_cli_impl.hpp @@ -668,18 +668,23 @@ void resource_query_t::set_job (const uint64_t jobid, const std::shared_ptr info = nullptr; + bool pcanceled = false; if (jobid > (uint64_t)std::numeric_limits::max ()) { errno = EOVERFLOW; return rc; } - rc = traverser->remove (static_cast (jobid)); + if (jobs.find (jobid) != jobs.end ()) { + info = jobs[jobid]; + pcanceled = info->get_pcancel_state (); + } + + rc = traverser->remove (static_cast (jobid), pcanceled); if (rc == 0) { - if (jobs.find (jobid) != jobs.end ()) { - std::shared_ptr info = jobs[jobid]; + if (info) info->state = job_lifecycle_t::CANCELED; - } } else { m_err_msg += traverser->err_message (); traverser->clear_err_message (); @@ -691,6 +696,7 @@ int resource_query_t::remove_job (const uint64_t jobid, const std::string &R, bo { int rc = -1; std::shared_ptr reader; + std::shared_ptr info = nullptr; if (jobid > (uint64_t)std::numeric_limits::max ()) { errno = EOVERFLOW; @@ -706,14 +712,15 @@ int resource_query_t::remove_job (const uint64_t jobid, const std::string &R, bo return rc; } + if (jobs.find (jobid) != jobs.end ()) { + info = jobs[jobid]; + info->set_pcanceled (); + } + rc = traverser->remove (R, reader, static_cast (jobid), full_removal); if (rc == 0) { - if (full_removal) { - auto job_info_it = jobs.find (jobid); - if (job_info_it != jobs.end ()) { - job_info_it->second->state = job_lifecycle_t::CANCELED; - } - } + if (full_removal && info) + info->state = job_lifecycle_t::CANCELED; } else { m_err_msg += traverser->err_message (); traverser->clear_err_message (); diff --git a/resource/traversers/dfu.cpp b/resource/traversers/dfu.cpp index f331c3123..59f2bd213 100644 --- a/resource/traversers/dfu.cpp +++ b/resource/traversers/dfu.cpp @@ -438,7 +438,7 @@ int dfu_traverser_t::find (std::shared_ptr &writers, const std: return detail::dfu_impl_t::find (writers, criteria); } -int dfu_traverser_t::remove (int64_t jobid) +int dfu_traverser_t::remove (int64_t jobid, bool pcanceled) { subsystem_t dom = get_match_cb ()->dom_subsystem (); if (!get_graph () || !get_graph_db () @@ -449,7 +449,7 @@ int dfu_traverser_t::remove (int64_t jobid) } vtx_t root = get_graph_db ()->metadata.roots.at (dom); - return detail::dfu_impl_t::remove (root, jobid); + return detail::dfu_impl_t::remove (root, jobid, pcanceled); } int dfu_traverser_t::remove (const std::string &R_to_cancel, diff --git a/resource/traversers/dfu.hpp b/resource/traversers/dfu.hpp index 1db95343e..c9d032a9f 100644 --- a/resource/traversers/dfu.hpp +++ b/resource/traversers/dfu.hpp @@ -159,10 +159,12 @@ class dfu_traverser_t : protected detail::dfu_impl_t { * the resource state. * * \param jobid job id. + * \param pcanceled bool indicating whether preceded by one or more partial + * cancels * \return 0 on success; -1 on error. * EINVAL: graph, roots or match callback not set. */ - int remove (int64_t jobid); + int remove (int64_t jobid, bool pcanceled); /*! Remove the allocation/reservation referred to by jobid and update * the resource state. diff --git a/resource/traversers/dfu_impl.hpp b/resource/traversers/dfu_impl.hpp index ad78c11c9..21edd0bbd 100644 --- a/resource/traversers/dfu_impl.hpp +++ b/resource/traversers/dfu_impl.hpp @@ -284,9 +284,11 @@ class dfu_impl_t { * * \param root root resource vertex. * \param jobid job id. + * \param pcanceled bool indicating whether preceded by one or more partial + * cancels * \return 0 on success; -1 on error. */ - int remove (vtx_t root, int64_t jobid); + int remove (vtx_t root, int64_t jobid, bool pcanceled); /*! Remove the allocation/reservation referred to by jobid and update * the resource state. diff --git a/resource/traversers/dfu_impl_update.cpp b/resource/traversers/dfu_impl_update.cpp index 7ea2b12b0..abae3f59e 100644 --- a/resource/traversers/dfu_impl_update.cpp +++ b/resource/traversers/dfu_impl_update.cpp @@ -433,7 +433,11 @@ int dfu_impl_t::mod_agfilter (vtx_t u, goto done; } if (mod_data.mod_type != job_modify_t::PARTIAL_CANCEL) { - if ((rc = planner_multi_rem_span (subtree_plan, span_it->second)) != 0) { + bool post_pcancel = false; + if (mod_data.mod_type == job_modify_t::POST_PCANCEL) + post_pcancel = true; + + if ((rc = planner_multi_rem_span (subtree_plan, span_it->second, post_pcancel)) != 0) { m_err_msg += __FUNCTION__; m_err_msg += ": planner_multi_rem_span returned -1.\n"; m_err_msg += (*m_graph)[u].name + ".\n"; @@ -536,6 +540,9 @@ int dfu_impl_t::mod_plan (vtx_t u, int64_t jobid, modify_data_t &mod_data) if (mod_data.mod_type != job_modify_t::PARTIAL_CANCEL) { (*m_graph)[u].schedule.allocations.erase (alloc_span); } else { + // Need to convert to c_str () because interner is finalized + if (std::string ((*m_graph)[u].type.c_str ()) == "ssd") + mod_data.brokerless_res.insert (u); goto done; } } else if ((res_span = (*m_graph)[u].schedule.reservations.find (jobid)) @@ -771,12 +778,16 @@ int dfu_impl_t::update (vtx_t root, return (rc > 0) ? 0 : -1; } -int dfu_impl_t::remove (vtx_t root, int64_t jobid) +int dfu_impl_t::remove (vtx_t root, int64_t jobid, bool pcanceled) { bool root_has_jtag = ((*m_graph)[root].idata.tags.find (jobid) != (*m_graph)[root].idata.tags.end ()); modify_data_t mod_data; - mod_data.mod_type = job_modify_t::CANCEL; + + if (pcanceled) + mod_data.mod_type = job_modify_t::POST_PCANCEL; + else + mod_data.mod_type = job_modify_t::CANCEL; m_color.reset (); return (root_has_jtag) ? mod_dfv (root, jobid, mod_data) : mod_exv (jobid, mod_data); } @@ -829,6 +840,18 @@ int dfu_impl_t::remove (vtx_t root, m_color.reset (); if (root_has_jtag) { rc = mod_dfv (root, jobid, mod_data); + + // Need to re-run VTX_CANCEL in case brokerless resources (e.g., SSDs) + // were found in the previous VTX_CANCEL + if (mod_data.brokerless_res.size () > 0) { + mod_data.mod_type = job_modify_t::VTX_CANCEL; + for (const vtx_t &vtx : mod_data.brokerless_res) { + if ((rc = cancel_vertex (vtx, mod_data, jobid)) != 0) { + errno = EINVAL; + return rc; + } + } + } // Was the root vertex's job tag removed? If so, full_cancel full_cancel = ((*m_graph)[root].idata.tags.find (jobid) == (*m_graph)[root].idata.tags.end ()); diff --git a/resource/utilities/command.cpp b/resource/utilities/command.cpp index 782d0dd0f..ae2f9643b 100644 --- a/resource/utilities/command.cpp +++ b/resource/utilities/command.cpp @@ -108,11 +108,16 @@ command_t commands[] = static int do_remove (std::shared_ptr &ctx, int64_t jobid) { int rc = -1; - if ((rc = ctx->traverser->remove ((int64_t)jobid)) == 0) { - if (ctx->jobs.find (jobid) != ctx->jobs.end ()) { - std::shared_ptr info = ctx->jobs[jobid]; + bool pcanceled = false; + std::shared_ptr info = nullptr; + + if (ctx->jobs.find (jobid) != ctx->jobs.end ()) { + info = ctx->jobs[jobid]; + pcanceled = info->get_pcancel_state (); + } + if ((rc = ctx->traverser->remove ((int64_t)jobid, pcanceled)) == 0) { + if (info) info->state = job_lifecycle_t::CANCELED; - } } else { std::cout << ctx->traverser->err_message (); ctx->traverser->clear_err_message (); @@ -127,12 +132,15 @@ static int do_partial_remove (std::shared_ptr &ctx, bool &full_cancel) { int rc = -1; + std::shared_ptr info = nullptr; + if (ctx->jobs.find (jobid) != ctx->jobs.end ()) { + info = ctx->jobs[jobid]; + info->set_pcanceled (); + } if ((rc = ctx->traverser->remove (R_cancel, reader, (int64_t)jobid, full_cancel)) == 0) { - if (full_cancel && (ctx->jobs.find (jobid) != ctx->jobs.end ())) { - std::shared_ptr info = ctx->jobs[jobid]; + if (full_cancel && info) info->state = job_lifecycle_t::CANCELED; - } } else { std::cout << ctx->traverser->err_message (); ctx->traverser->clear_err_message ();