From 44aa8ba8b63a89ecfac5102adca14dd002e3f9c4 Mon Sep 17 00:00:00 2001 From: bbimber Date: Sun, 1 Dec 2024 20:58:15 -0800 Subject: [PATCH] Parse request memory and mem used for slurm jobs --- .../pipeline/SlurmExecutionEngine.java | 44 ++++++++++++++----- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/cluster/src/org/labkey/cluster/pipeline/SlurmExecutionEngine.java b/cluster/src/org/labkey/cluster/pipeline/SlurmExecutionEngine.java index ab610d413..017ce8e46 100644 --- a/cluster/src/org/labkey/cluster/pipeline/SlurmExecutionEngine.java +++ b/cluster/src/org/labkey/cluster/pipeline/SlurmExecutionEngine.java @@ -247,6 +247,8 @@ protected Pair getStatusForJob(ClusterJob job, Container c) int stateIdx = -1; int hostnameIdx = -1; int maxRssIdx = -1; + int reqMemIdx = -1; + String reqMem = null; for (String line : ret) { line = StringUtils.trimToNull(line); @@ -263,6 +265,7 @@ protected Pair getStatusForJob(ClusterJob job, Container c) stateIdx = header.indexOf("STATE"); hostnameIdx = header.indexOf("NODELIST"); maxRssIdx = header.indexOf("MAXRSS"); + reqMemIdx = header.indexOf("REQMEM"); if (stateIdx == -1) { @@ -303,6 +306,16 @@ else if (headerFound) } } + if (reqMemIdx > -1 && reqMemIdx < tokens.length) + { + String val = StringUtils.trimToNull(tokens[reqMemIdx]); + if (val != null) + { + reqMem = val; + } + + } + // NOTE: if the line has blank ending columns, trimmed lines might lack that value if ((job.getClusterId() + ".0").equals(id) && maxRssIdx > -1 && maxRssIdx < tokens.length) { @@ -312,21 +325,28 @@ else if (headerFound) if (maxRSS != null) { double bytes = FileSizeFormatter.convertStringRepresentationToBytes(maxRSS); - double requestInBytes = FileSizeFormatter.convertStringRepresentationToBytes(getConfig().getRequestMemory() + "G"); //request is always GB - if (bytes > requestInBytes) + if (reqMem == null) { - info = "Job exceeded memory, max was: " + FileSizeFormatter.convertBytesToUnit(bytes, 'G') + "G, requested memory was: " + getConfig().getRequestMemory() + "G"; - - PipelineStatusFile sf = PipelineService.get().getStatusFile(job.getJobId()); - if (sf != null) + _log.warn("Unable to find ReqMem for slurm job: " + job.getClusterId()); + } + else + { + double requestInBytes = FileSizeFormatter.convertStringRepresentationToBytes(reqMem); + if (bytes > requestInBytes) { - try (PrintWriter writer = PrintWriters.getPrintWriter(new File(sf.getFilePath()), StandardOpenOption.APPEND)) - { - writer.println(info + ". Raw slurm value: " + maxRSS); - } - catch (FileNotFoundException e) + info = "Job exceeded memory, max was: " + FileSizeFormatter.convertBytesToUnit(bytes, 'G') + "G, requested memory was: " + FileSizeFormatter.convertBytesToUnit(requestInBytes, 'G'); + + PipelineStatusFile sf = PipelineService.get().getStatusFile(job.getJobId()); + if (sf != null) { - _log.error("Unable to find log file for job, " + job.getJobId() + ": " + sf.getFilePath()); + try (PrintWriter writer = PrintWriters.getPrintWriter(new File(sf.getFilePath()), StandardOpenOption.APPEND)) + { + writer.println(info + ". Raw slurm value: " + maxRSS); + } + catch (FileNotFoundException e) + { + _log.error("Unable to find log file for job, " + job.getJobId() + ": " + sf.getFilePath()); + } } } }