From b910bd4fba36c9daabc3c4996153111be207bd1f Mon Sep 17 00:00:00 2001 From: vsoch Date: Fri, 1 Nov 2024 07:05:58 -0600 Subject: [PATCH] feat: shrink support for fluxion This changeset exposes the remove_subgraph function, which we call shrink. I do not think it handles jobs properly yet, but it should be a reasonable starting point for testing and debugging. Signed-off-by: vsoch --- .devcontainer/Dockerfile | 2 +- .github/workflows/main.yaml | 9 ++++++++- cmd/test/test.go | 19 +++++++++++++++++++ pkg/fluxcli/reapi_cli.go | 25 +++++++++++++++++++++---- 4 files changed, 49 insertions(+), 6 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 2b93aff..4f10c78 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -20,7 +20,7 @@ RUN wget https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz && tar -xvf go${G ENV PATH=$PATH:/usr/local/go/bin:/home/vscode/go/bin # Testing grow/shrink from custom branch -RUN git clone -b debug-resource-error-messages https://github.com/researchapps/flux-sched /opt/flux-sched +RUN git clone -b add-shrink https://github.com/researchapps/flux-sched /opt/flux-sched # RUN git clone https://github.com/flux-framework/flux-sched /opt/flux-sched # We also need to rebuild into the system install diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 9cfc8b5..60db0c7 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -38,7 +38,14 @@ jobs: # TODO: we should consider distributing the header files with the release builds - name: flux-sched build - run: git clone https://github.com/flux-framework/flux-sched /opt/flux-sched + run: git clone -b add-shrink https://github.com/researchapps/flux-sched /opt/flux-sched + - name: flux-sched compile + run: | + cd /opt/flux-sched + cmake -B build + make -C build + make -C build install + cd - - name: Build run: LIB_PREFIX=${{ matrix.test[1] }} make build - name: Test Binary diff --git a/cmd/test/test.go 
b/cmd/test/test.go index da7fb71..ae45503 100644 --- a/cmd/test/test.go +++ b/cmd/test/test.go @@ -173,6 +173,25 @@ func main() { log.Fatalf("Error in ReapiClient MatchSatisfy - asking for 4 nodes should now succeed: %v\n", err) } + // Shrink (remove subgraph) for node2 + fmt.Println("🥕 Asking to Shrink from 4 to 3 Nodes") + err = cli.Shrink("/tiny0/rack0/node2") + if err != nil { + log.Fatalf("Error in ReapiClient Shrink: %s %s\n", err, cli.GetErrMsg()) + } + fmt.Printf("Shrink request return value: %v\n", err) + + fmt.Println("Asking to MatchSatisfy 4 nodes (again, not possible)") + sat, overhead, err = cli.MatchSatisfy(growJobspec) + checkErrors(cli) + if err != nil { + log.Fatalf("Error in ReapiClient MatchSatisfy: %v\n", err) + } + printSatOutput(sat, err) + if sat { + log.Fatalf("Error in ReapiClient MatchSatisfy - asking for 4 nodes with only 3 should fail: %v\n", err) + } + } func printOutput(reserved bool, allocated string, at int64, jobid uint64, err error) { diff --git a/pkg/fluxcli/reapi_cli.go b/pkg/fluxcli/reapi_cli.go index 2360cea..0c10c89 100644 --- a/pkg/fluxcli/reapi_cli.go +++ b/pkg/fluxcli/reapi_cli.go @@ -220,14 +220,14 @@ func (cli *ReapiClient) UpdateAllocate(jobid int, r string) (at int64, overhead return at, overhead, r_out, err } -// Update the resource state with R. +// Update the resource state with R (grow). // // \param h Opaque handle. How it is used is an implementation -// detail. However, when it is used within a Flux's +// detail. However, when it is used within a Flux's // service module, it is expected to be a pointer // to a flux_t object. -// \param R_subgraph R String -// \return 0 on success; -1 on error. +// \param R_subgraph R String +// \return 0 on success; -1 on error. 
// int reapi_cli_grow (reapi_cli_ctx_t *ctx, const char *R_subgraph); func (cli *ReapiClient) Grow(rSubgraph string) (err error) { var resources = C.CString(rSubgraph) @@ -237,6 +237,23 @@ func (cli *ReapiClient) Grow(rSubgraph string) (err error) { return retvalToError(fluxerr, "issue resource api client grow") } +// Shrink updates the resource state (shrink) for the subgraph at rNodePath. +// +// rNodePath is copied into a C string, which is freed when Shrink returns, +// and passed to reapi_cli_shrink together with the client context cli.ctx. +// The integer result of the C call is converted to a Go error by +// retvalToError. +// \param rNodePath path of the node to prune, e.g. "/tiny0/rack0/node2" +// \return error built by retvalToError (C side: 0 on success; -1 on error). +// int reapi_cli_shrink (reapi_cli_ctx_t *ctx, const char *R_node_path); +func (cli *ReapiClient) Shrink(rNodePath string) (err error) { + var nodePath = C.CString(rNodePath) + defer C.free(unsafe.Pointer(nodePath)) + + fluxerr := (int)(C.reapi_cli_shrink((*C.struct_reapi_cli_ctx)(cli.ctx), nodePath)) + return retvalToError(fluxerr, "issue resource api client shrink") +} + // Cancel cancels the allocation or reservation corresponding to jobid. // // \param jobid jobid of the uint64_t type.