Skip to content

Commit

Permalink
Merge pull request #1335 from milroy/partial-bug
Browse files (browse the repository at this point in the history)
Break visitation cycle in mod_dfv to correct cancellation behavior
  • Loading branch information
mergify[bot] authored Feb 10, 2025
2 parents: 701acfb + 0fd29d7; commit: 1d08fe1
Show file tree
Hide file tree
Showing 19 changed files with 822 additions and 27 deletions.
13 changes: 11 additions & 2 deletions resource/traversers/dfu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,7 @@ int dfu_traverser_t::find (std::shared_ptr<match_writers_t> &writers, const std:

int dfu_traverser_t::remove (int64_t jobid)
{
int rc = 0;
// Clear the error message to disambiguate errors
clear_err_message ();

Expand All @@ -463,14 +464,19 @@ int dfu_traverser_t::remove (int64_t jobid)
}

vtx_t root = get_graph_db ()->metadata.roots.at (dom);
return detail::dfu_impl_t::remove (root, jobid);

rc = detail::dfu_impl_t::remove (root, jobid);
m_total_preorder = detail::dfu_impl_t::get_preorder_count ();
m_total_postorder = detail::dfu_impl_t::get_postorder_count ();
return rc;
}

int dfu_traverser_t::remove (const std::string &R_to_cancel,
std::shared_ptr<resource_reader_base_t> &reader,
int64_t jobid,
bool &full_cancel)
{
int rc = 0;
// Clear the error message to disambiguate errors
clear_err_message ();

Expand All @@ -483,7 +489,10 @@ int dfu_traverser_t::remove (const std::string &R_to_cancel,
}

vtx_t root = get_graph_db ()->metadata.roots.at (dom);
return detail::dfu_impl_t::remove (root, R_to_cancel, reader, jobid, full_cancel);
rc = detail::dfu_impl_t::remove (root, R_to_cancel, reader, jobid, full_cancel);
m_total_preorder = detail::dfu_impl_t::get_preorder_count ();
m_total_postorder = detail::dfu_impl_t::get_postorder_count ();
return rc;
}

int dfu_traverser_t::mark (const std::string &root_path, resource_pool_t::status_t status)
Expand Down
9 changes: 9 additions & 0 deletions resource/traversers/dfu_impl_update.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -595,6 +595,8 @@ int dfu_impl_t::mod_dfv (vtx_t u, int64_t jobid, modify_data_t &mod_data)
subsystem_t dom = m_match->dom_subsystem ();
f_out_edg_iterator_t ei, ei_end;

m_preorder++;
(*m_graph)[u].idata.colors[dom] = m_color.gray ();
if ((rc = mod_idata (u, jobid, dom, mod_data, stop)) != 0 || stop)
goto done;
if ((rc = mod_plan (u, jobid, mod_data)) != 0)
Expand All @@ -610,6 +612,8 @@ int dfu_impl_t::mod_dfv (vtx_t u, int64_t jobid, modify_data_t &mod_data)
rc += mod_upv (tgt, jobid, mod_data);
}
}
(*m_graph)[u].idata.colors[dom] = m_color.black ();
m_postorder++;
done:
return rc;
}
Expand Down Expand Up @@ -778,6 +782,9 @@ int dfu_impl_t::update (vtx_t root,

int dfu_impl_t::remove (vtx_t root, int64_t jobid)
{
m_preorder = 0;
m_postorder = 0;

bool root_has_jtag =
((*m_graph)[root].idata.tags.find (jobid) != (*m_graph)[root].idata.tags.end ());
modify_data_t mod_data;
Expand All @@ -796,6 +803,8 @@ int dfu_impl_t::remove (vtx_t root,
modify_data_t mod_data;
resource_graph_t &g = m_graph_db->resource_graph;
resource_graph_metadata_t &m = m_graph_db->metadata;
m_preorder = 0;
m_postorder = 0;

if (reader->partial_cancel (g, m, mod_data, R_to_cancel, jobid) != 0) {
m_err_msg += __FUNCTION__;
Expand Down
51 changes: 47 additions & 4 deletions resource/utilities/command.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,13 @@ command_t commands[] =
"c",
cmd_cancel,
"Cancel an allocation or reservation: "
"resource-query> cancel jobid"},
"resource-query> cancel jobid (optional subcmd: stats)"},
{"partial-cancel",
"pc",
cmd_partial_cancel,
"Partially release an allocation: "
"resource-query> partial-cancel jobid (file format: jgf | rv1exec) R_to_cancel.file"},
"resource-query> partial-cancel jobid (file format: jgf | rv1exec) R_to_cancel.file "
"(optional subcmd: stats)"},
{"set-property",
"p",
cmd_set_property,
Expand Down Expand Up @@ -651,14 +652,22 @@ int cmd_find (std::shared_ptr<resource_context_t> &ctx, std::vector<std::string>

int cmd_cancel (std::shared_ptr<resource_context_t> &ctx, std::vector<std::string> &args)
{
if (args.size () != 2) {
if (args.size () < 2 || args.size () > 3) {
std::cerr << "ERROR: malformed command" << std::endl;
return 0;
}

int rc = -1;
std::string jobid_str = args[1];
uint64_t jobid = (uint64_t)std::strtoll (jobid_str.c_str (), NULL, 10);
std::string stats = "";
unsigned int preorder_count = 0;
unsigned int postorder_count = 0;
std::ostream &out = (ctx->params.r_fname != "") ? ctx->params.r_out : std::cout;

if (args.size () == 3) {
stats = args[2];
}

if (ctx->allocations.find (jobid) != ctx->allocations.end ()) {
if ((rc = do_remove (ctx, jobid)) == 0)
Expand All @@ -675,6 +684,19 @@ int cmd_cancel (std::shared_ptr<resource_context_t> &ctx, std::vector<std::strin
std::cerr << "ERROR: error encountered while removing job " << jobid << std::endl;
}

if (stats == "stats") {
preorder_count = ctx->traverser->get_total_preorder_count ();
postorder_count = ctx->traverser->get_total_postorder_count ();
out << "INFO:"
<< " =============================" << std::endl;
out << "INFO:"
<< " CANCEL PREORDER COUNT=\"" << preorder_count << "\"" << std::endl;
out << "INFO:"
<< " CANCEL POSTORDER COUNT=\"" << postorder_count << "\"" << std::endl;
out << "INFO:"
<< " =============================" << std::endl;
}

done:
return 0;
}
Expand All @@ -685,7 +707,7 @@ int cmd_partial_cancel (std::shared_ptr<resource_context_t> &ctx, std::vector<st
std::stringstream buffer{};
std::shared_ptr<resource_reader_base_t> rd;

if (args.size () != 4) {
if (args.size () < 4 || args.size () > 5) {
std::cerr << "ERROR: malformed command" << std::endl;
return 0;
}
Expand All @@ -695,6 +717,14 @@ int cmd_partial_cancel (std::shared_ptr<resource_context_t> &ctx, std::vector<st
std::ifstream cancel_file (args[3]);
uint64_t jobid = (uint64_t)std::strtoll (jobid_str.c_str (), NULL, 10);
bool full_cancel = false;
std::string stats = "";
unsigned int preorder_count = 0;
unsigned int postorder_count = 0;
std::ostream &out = (ctx->params.r_fname != "") ? ctx->params.r_out : std::cout;

if (args.size () == 5) {
stats = args[4];
}

if (!(reader == "jgf" || reader == "rv1exec")) {
std::cerr << "ERROR: unsupported reader " << args[2] << std::endl;
Expand Down Expand Up @@ -737,6 +767,19 @@ int cmd_partial_cancel (std::shared_ptr<resource_context_t> &ctx, std::vector<st
std::cerr << "ERROR: error encountered while removing job " << jobid << std::endl;
}

if (stats == "stats") {
preorder_count = ctx->traverser->get_total_preorder_count ();
postorder_count = ctx->traverser->get_total_postorder_count ();
out << "INFO:"
<< " =============================" << std::endl;
out << "INFO:"
<< " PARTIAL CANCEL PREORDER COUNT=\"" << preorder_count << "\"" << std::endl;
out << "INFO:"
<< " PARTIAL CANCEL POSTORDER COUNT=\"" << postorder_count << "\"" << std::endl;
out << "INFO:"
<< " =============================" << std::endl;
}

done:
return 0;
}
Expand Down
6 changes: 3 additions & 3 deletions t/data/resource/commands/cancel/cmds01.in
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ match allocate_orelse_reserve @TEST_SRCDIR@/data/resource/jobspecs/cancel/test01
match allocate_orelse_reserve @TEST_SRCDIR@/data/resource/jobspecs/cancel/test015.yaml
match allocate_orelse_reserve @TEST_SRCDIR@/data/resource/jobspecs/cancel/test016.yaml
match allocate_orelse_reserve @TEST_SRCDIR@/data/resource/jobspecs/cancel/test017.yaml
cancel 1
cancel 1 stats
match allocate_orelse_reserve @TEST_SRCDIR@/data/resource/jobspecs/cancel/test001.yaml
cancel 2
cancel 2 stats
match allocate_orelse_reserve @TEST_SRCDIR@/data/resource/jobspecs/cancel/test002.yaml
cancel 12
cancel 12 stats
match allocate_orelse_reserve @TEST_SRCDIR@/data/resource/jobspecs/cancel/test012.yaml
quit
34 changes: 17 additions & 17 deletions t/data/resource/commands/cancel/cmds02.in
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,23 @@ match allocate_orelse_reserve @TEST_SRCDIR@/data/resource/jobspecs/cancel/test01
match allocate_orelse_reserve @TEST_SRCDIR@/data/resource/jobspecs/cancel/test015.yaml
match allocate_orelse_reserve @TEST_SRCDIR@/data/resource/jobspecs/cancel/test016.yaml
match allocate_orelse_reserve @TEST_SRCDIR@/data/resource/jobspecs/cancel/test017.yaml
cancel 1
cancel 2
cancel 3
cancel 4
cancel 5
cancel 6
cancel 7
cancel 8
cancel 9
cancel 10
cancel 11
cancel 12
cancel 13
cancel 14
cancel 15
cancel 16
cancel 17
cancel 1 stats
cancel 2 stats
cancel 3 stats
cancel 4 stats
cancel 5 stats
cancel 6 stats
cancel 7 stats
cancel 8 stats
cancel 9 stats
cancel 10 stats
cancel 11 stats
cancel 12 stats
cancel 13 stats
cancel 14 stats
cancel 15 stats
cancel 16 stats
cancel 17 stats
match allocate_orelse_reserve @TEST_SRCDIR@/data/resource/jobspecs/cancel/test001.yaml
match allocate_orelse_reserve @TEST_SRCDIR@/data/resource/jobspecs/cancel/test002.yaml
match allocate_orelse_reserve @TEST_SRCDIR@/data/resource/jobspecs/cancel/test003.yaml
Expand Down
9 changes: 9 additions & 0 deletions t/data/resource/commands/cancel/cmds10.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
match allocate @TEST_SRCDIR@/data/resource/jobspecs/cancel/test022.yaml
partial-cancel 1 jgf @TEST_SRCDIR@/data/resource/jgfs/elastic/node-1-partial-cancel.json stats
find sched-now=allocated
info 1
cancel 1 stats
find sched-now=allocated
match allocate @TEST_SRCDIR@/data/resource/jobspecs/cancel/test022.yaml
find sched-now=allocated
quit
9 changes: 9 additions & 0 deletions t/data/resource/commands/cancel/cmds11.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
match allocate @TEST_SRCDIR@/data/resource/jobspecs/cancel/test022.yaml
partial-cancel 1 rv1exec @TEST_SRCDIR@/data/resource/rv1exec/cancel/rank0_cancel-jgfgraph.json stats
find sched-now=allocated
info 1
cancel 1 stats
find sched-now=allocated
match allocate @TEST_SRCDIR@/data/resource/jobspecs/cancel/test022.yaml
find sched-now=allocated
quit
12 changes: 12 additions & 0 deletions t/data/resource/expected/cancel/001.R.out
Original file line number Diff line number Diff line change
Expand Up @@ -19911,6 +19911,10 @@ INFO: =============================
INFO: JOBID=17
INFO: RESOURCES=RESERVED
INFO: SCHEDULED AT=147601
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="73"
INFO: CANCEL POSTORDER COUNT="41"
INFO: =============================
---------------core0[1:x]
---------------core1[1:x]
Expand Down Expand Up @@ -19957,6 +19961,10 @@ INFO: =============================
INFO: JOBID=18
INFO: RESOURCES=ALLOCATED
INFO: SCHEDULED AT=Now
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="4915"
INFO: CANCEL POSTORDER COUNT="3907"
INFO: =============================
---------------core0[1:x]
---------------core1[1:x]
Expand Down Expand Up @@ -23869,6 +23877,10 @@ INFO: =============================
INFO: JOBID=19
INFO: RESOURCES=ALLOCATED
INFO: SCHEDULED AT=Now
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="3151"
INFO: CANCEL POSTORDER COUNT="3141"
INFO: =============================
---------------core0[1:x]
---------------core1[1:x]
Expand Down
68 changes: 68 additions & 0 deletions t/data/resource/expected/cancel/002.R.out
Original file line number Diff line number Diff line change
Expand Up @@ -19911,6 +19911,74 @@ INFO: =============================
INFO: JOBID=17
INFO: RESOURCES=RESERVED
INFO: SCHEDULED AT=147601
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="73"
INFO: CANCEL POSTORDER COUNT="41"
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="4915"
INFO: CANCEL POSTORDER COUNT="3907"
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="2615"
INFO: CANCEL POSTORDER COUNT="2007"
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="1679"
INFO: CANCEL POSTORDER COUNT="1285"
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="3151"
INFO: CANCEL POSTORDER COUNT="3141"
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="667"
INFO: CANCEL POSTORDER COUNT="640"
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="427"
INFO: CANCEL POSTORDER COUNT="395"
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="763"
INFO: CANCEL POSTORDER COUNT="738"
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="400"
INFO: CANCEL POSTORDER COUNT="302"
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="667"
INFO: CANCEL POSTORDER COUNT="640"
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="409"
INFO: CANCEL POSTORDER COUNT="394"
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="3151"
INFO: CANCEL POSTORDER COUNT="3141"
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="379"
INFO: CANCEL POSTORDER COUNT="346"
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="121"
INFO: CANCEL POSTORDER COUNT="100"
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="313"
INFO: CANCEL POSTORDER COUNT="296"
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="313"
INFO: CANCEL POSTORDER COUNT="296"
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="2173"
INFO: CANCEL POSTORDER COUNT="2160"
INFO: =============================
---------------core0[1:x]
---------------core1[1:x]
Expand Down
12 changes: 12 additions & 0 deletions t/data/resource/expected/cancel/003.R.out
Original file line number Diff line number Diff line change
Expand Up @@ -19911,6 +19911,10 @@ INFO: =============================
INFO: JOBID=17
INFO: RESOURCES=RESERVED
INFO: SCHEDULED AT=147601
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="73"
INFO: CANCEL POSTORDER COUNT="41"
INFO: =============================
---------------core0[1:x]
---------------core1[1:x]
Expand Down Expand Up @@ -19957,6 +19961,10 @@ INFO: =============================
INFO: JOBID=18
INFO: RESOURCES=ALLOCATED
INFO: SCHEDULED AT=Now
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="4915"
INFO: CANCEL POSTORDER COUNT="3907"
INFO: =============================
---------------core0[1:x]
---------------core1[1:x]
Expand Down Expand Up @@ -23869,6 +23877,10 @@ INFO: =============================
INFO: JOBID=19
INFO: RESOURCES=ALLOCATED
INFO: SCHEDULED AT=Now
INFO: =============================
INFO: =============================
INFO: CANCEL PREORDER COUNT="3151"
INFO: CANCEL POSTORDER COUNT="3141"
INFO: =============================
---------------core0[1:x]
---------------core1[1:x]
Expand Down
Loading

0 comments on commit 1d08fe1

Please sign in to comment.