Skip to content

Commit

Permalink
Crude profiling of time waiting for the multiprocessing queue
Browse files Browse the repository at this point in the history
Semicolon-separated CSV records can be extracted from the logs by
filtering on the marker string "QUEUE_PERF".
Both the semaphore wait and the queue put/get are timed, on both the
producer and the consumer side.
  • Loading branch information
Waino committed May 20, 2024
1 parent 8475e37 commit 4cc202a
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 0 deletions.
7 changes: 7 additions & 0 deletions mammoth/distributed/communication.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import pickle
import signal
import time

import torch
import torch.distributed
Expand Down Expand Up @@ -213,11 +214,17 @@ def batch_producer(generator_to_serve, queue, semaphore, opts, device_id):
logger.info(generator_to_serve)

for batch, metadata, communication_batch_id in generator_to_serve:
start = time.time()
semaphore.acquire()
duration = time.time() - start
logger.warning(f'QUEUE_PERF;producer_semaphore;{duration}')
# Move batch to correspond device_id when consumer iterate
# hack to dodge unpicklable `dict_keys`
# batch.fields = list(batch.fields)
start = time.time()
queue.put((batch, metadata, communication_batch_id))
duration = time.time() - start
logger.warning(f'QUEUE_PERF;producer_put;{duration}')


def consumer(process_fn, opts, device_context, error_queue, batch_queue, semaphore, task_queue_manager, checkpoint):
Expand Down
6 changes: 6 additions & 0 deletions mammoth/train_single.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,14 @@ def main(

def _train_iter():
    """Yield batches from ``batch_queue`` forever, logging queue wait times.

    Each iteration blocks on ``batch_queue.get()``, then calls
    ``semaphore.release()``, and finally yields the
    ``(batch, metadata, communication_batch_id)`` triple unchanged.

    The time spent in each of the two calls is logged at WARNING level as
    ``QUEUE_PERF;consumer_get;<seconds>`` and
    ``QUEUE_PERF;consumer_semaphore;<seconds>``, so the measurements can be
    pulled out of the logs as semicolon-separated records.
    """
    # Imported locally so this helper does not depend on a module-level
    # `import time`, which is not visible from this part of the file.
    import time

    # perf_counter() is monotonic and high resolution; time.time() follows the
    # wall clock and can yield skewed or negative durations if it is adjusted.
    clock = time.perf_counter

    while True:
        start = clock()
        batch, metadata, communication_batch_id = batch_queue.get()
        duration = clock() - start
        logger.warning(f'QUEUE_PERF;consumer_get;{duration}')

        # NOTE(review): presumably this is the semaphore the producer acquires
        # before each put, so releasing it frees one slot -- confirm.
        start = clock()
        semaphore.release()
        duration = clock() - start
        logger.warning(f'QUEUE_PERF;consumer_semaphore;{duration}')

        # TODO: confirm that the batch-providing corpus has already moved
        # (`.to()`) the batch to the correct place
        yield batch, metadata, communication_batch_id

Expand Down

0 comments on commit 4cc202a

Please sign in to comment.