Skip to content

Commit

Permalink
Add parameter --time_budget_s to end GPU assignment early
Browse files (browse the repository at this point in the history)
  • Loading branch information
Waino committed Dec 19, 2023
1 parent 6bb5434 commit 21009ef
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 2 deletions.
5 changes: 5 additions & 0 deletions tools/config_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,10 @@ def add_allocate_device_args(parser):
parser.add_argument('--n_gpus_per_node', type=int)
parser.add_argument('--n_slots_per_gpu', type=int)
parser.add_argument('--log_name', type=str)
parser.add_argument(
'--time_budget_s', type=int,
help='time budget for GPU assignment, in seconds',
)


def add_set_transforms_args(parser):
Expand Down Expand Up @@ -533,6 +537,7 @@ def allocate_devices(opts):
lang_to_group_mapping=cc_opts['groups'],
lps_ready_to_start=lps_ready_to_start,
log_name=opts.log_name,
time_budget_s=opts.time_budget_s,
)

for gpu_slot, lp in assignment.items():
Expand Down
19 changes: 17 additions & 2 deletions tools/gpu_assignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,9 +361,13 @@ def swap_all_slots_once(self, assignment, current_cost, slot_subset=None):
slot_subset = self.gpu_slots if slot_subset is None else slot_subset
for i, slot_a in enumerate(tqdm(slot_subset, desc='swap_all_slots_once', leave=False)):
current_cost, assignment = self.best_swap_for(slot_a, assignment, current_cost, slot_subset)
if self.deadline and time.time() > self.deadline:
print('Time budget exceeded, finishing early mid-iteration', flush=True)
break
return current_cost, assignment

def optimize(self, assignment, current_cost, iterations=10, patience=1):
def optimize(self, assignment, current_cost, iterations=10, patience=1, time_budget_s=None):
self.deadline = time.time() + time_budget_s if time_budget_s else None
prev_cost = None
stalled = 0
print(f'initial cost: {current_cost}', flush=True)
Expand All @@ -376,6 +380,9 @@ def optimize(self, assignment, current_cost, iterations=10, patience=1):
current_cost,
slot_subset
)
if self.deadline and time.time() > self.deadline:
print('Time budget exceeded, finishing early', flush=True)
break
print(f'\niteration {i} least_favorite cost: {current_cost}', flush=True)
# Random subsets
slot_subsets = self.slot_subsets(self.gpu_slots, n=100)
Expand All @@ -393,6 +400,9 @@ def optimize(self, assignment, current_cost, iterations=10, patience=1):
if stalled > patience:
print('No improvement, finishing early', flush=True)
break
if self.deadline and time.time() > self.deadline:
print('Time budget exceeded, finishing early', flush=True)
break
return current_cost, assignment, i

def slot_subsets(self, slots, n=100):
Expand Down Expand Up @@ -435,6 +445,7 @@ def optimize_gpu_assignment(
lang_to_group_mapping: Dict[str, str],
lps_ready_to_start: Optional[Set[Tuple[str, str]]],
log_name: Optional[str] = None,
time_budget_s: Optional[int] = None,
):
optimizer = AssignmentOptimizer(
n_nodes=n_nodes,
Expand All @@ -447,7 +458,11 @@ def optimize_gpu_assignment(
initial = optimizer.initial_assignment(lang_pairs)
initial_cost = optimizer.cost(initial)
start = time.time()
best_cost, assignment, iterations = optimizer.optimize(initial, initial_cost)
best_cost, assignment, iterations = optimizer.optimize(
initial,
initial_cost,
time_budget_s=time_budget_s
)
duration_s = time.time() - start
print_assignment(assignment, lang_to_group_mapping, ready_to_start=lps_ready_to_start)
print(f'assignment cost {best_cost}', flush=True)
Expand Down

0 comments on commit 21009ef

Please sign in to comment.