Skip to content

Commit

Permalink
Parse requested memory and memory used for Slurm jobs
Browse files Browse the repository at this point in the history
  • Loading branch information
bbimber committed Dec 2, 2024
1 parent 9a4feeb commit 44aa8ba
Showing 1 changed file with 32 additions and 12 deletions.
44 changes: 32 additions & 12 deletions cluster/src/org/labkey/cluster/pipeline/SlurmExecutionEngine.java
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,8 @@ protected Pair<String, String> getStatusForJob(ClusterJob job, Container c)
int stateIdx = -1;
int hostnameIdx = -1;
int maxRssIdx = -1;
int reqMemIdx = -1;
String reqMem = null;
for (String line : ret)
{
line = StringUtils.trimToNull(line);
Expand All @@ -263,6 +265,7 @@ protected Pair<String, String> getStatusForJob(ClusterJob job, Container c)
stateIdx = header.indexOf("STATE");
hostnameIdx = header.indexOf("NODELIST");
maxRssIdx = header.indexOf("MAXRSS");
reqMemIdx = header.indexOf("REQMEM");

if (stateIdx == -1)
{
Expand Down Expand Up @@ -303,6 +306,16 @@ else if (headerFound)
}
}

if (reqMemIdx > -1 && reqMemIdx < tokens.length)
{
String val = StringUtils.trimToNull(tokens[reqMemIdx]);
if (val != null)
{
reqMem = val;
}

}

// NOTE: if the line has blank ending columns, trimmed lines might lack that value
if ((job.getClusterId() + ".0").equals(id) && maxRssIdx > -1 && maxRssIdx < tokens.length)
{
Expand All @@ -312,21 +325,28 @@ else if (headerFound)
if (maxRSS != null)
{
double bytes = FileSizeFormatter.convertStringRepresentationToBytes(maxRSS);
double requestInBytes = FileSizeFormatter.convertStringRepresentationToBytes(getConfig().getRequestMemory() + "G"); //request is always GB
if (bytes > requestInBytes)
if (reqMem == null)
{
info = "Job exceeded memory, max was: " + FileSizeFormatter.convertBytesToUnit(bytes, 'G') + "G, requested memory was: " + getConfig().getRequestMemory() + "G";

PipelineStatusFile sf = PipelineService.get().getStatusFile(job.getJobId());
if (sf != null)
_log.warn("Unable to find ReqMem for slurm job: " + job.getClusterId());
}
else
{
double requestInBytes = FileSizeFormatter.convertStringRepresentationToBytes(reqMem);
if (bytes > requestInBytes)
{
try (PrintWriter writer = PrintWriters.getPrintWriter(new File(sf.getFilePath()), StandardOpenOption.APPEND))
{
writer.println(info + ". Raw slurm value: " + maxRSS);
}
catch (FileNotFoundException e)
info = "Job exceeded memory, max was: " + FileSizeFormatter.convertBytesToUnit(bytes, 'G') + "G, requested memory was: " + FileSizeFormatter.convertBytesToUnit(requestInBytes, 'G');

PipelineStatusFile sf = PipelineService.get().getStatusFile(job.getJobId());
if (sf != null)
{
_log.error("Unable to find log file for job, " + job.getJobId() + ": " + sf.getFilePath());
try (PrintWriter writer = PrintWriters.getPrintWriter(new File(sf.getFilePath()), StandardOpenOption.APPEND))
{
writer.println(info + ". Raw slurm value: " + maxRSS);
}
catch (FileNotFoundException e)
{
_log.error("Unable to find log file for job, " + job.getJobId() + ": " + sf.getFilePath());
}
}
}
}
Expand Down

0 comments on commit 44aa8ba

Please sign in to comment.