Skip to content

Commit

Permalink
HTCondor Memory TRSH
Browse files Browse the repository at this point in the history
If HTCondor (ATLAS) dismisses a job for not using 20% of the requested memory within the first hour, the job is troubleshot with less memory and resubmitted.
  • Loading branch information
calvinp0 committed Dec 27, 2024
1 parent e64f3ec commit af7d53b
Showing 1 changed file with 20 additions and 0 deletions.
20 changes: 20 additions & 0 deletions arc/job/trsh.py
Original file line number Diff line number Diff line change
Expand Up @@ -978,6 +978,26 @@ def trsh_ess_job(label: str,
logger.info(f'Troubleshooting {job_type} job in {software} for {label} using more memory: {memory} GB '
f'instead of {memory_gb} GB')
ess_trsh_methods.append('memory')
elif 'Memory' in job_status['keywords'] and 'too high' in job_status['error'] and server is not None:
# Reduce the memory allocation to 80% of its current value (a 20% cut), rounded to the nearest 5 GB increment
couldnt_trsh = False
proposed_memory = round(memory_gb * 0.8 / 5) * 5 # Round to the nearest 5 GB increment
reduced_memory = max(2, proposed_memory) # Ensure a minimum of 2 GB

# Ensure reduced_memory is strictly less than current memory_gb
if reduced_memory >= memory_gb:
reduced_memory = max(2, memory_gb - 1)
logger.info(f'Troubleshooting {job_type} job in {software} for {label} using less memory: {reduced_memory} GB '
f'instead of {memory_gb} GB')

# Apply the reduction only if this exact memory level was not already attempted; otherwise give up on this method
if f'waste_memory_{reduced_memory}' not in ess_trsh_methods:
ess_trsh_methods.append(f'waste_memory_{reduced_memory}')
memory = reduced_memory # Update memory to the reduced value for next iteration
else:
couldnt_trsh = True
logger.info(f'{logger_phrase} was unsuccessful. No further reductions possible without reaching threshold.')


if attempted_ess_trsh_methods:
if attempted_ess_trsh_methods == ess_trsh_methods:
Expand Down

0 comments on commit af7d53b

Please sign in to comment.