Fix ort inputs filtering #129

Merged 3 commits on Feb 20, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -168,6 +168,7 @@ data/
version.txt

.engine/
actions-runner-duplicate/
actions-runner/
experiments/
amdsmi/
16 changes: 11 additions & 5 deletions Makefile
@@ -1,9 +1,5 @@
# List of targets that are not associated with files
.PHONY: quality style install \
build_docker_cpu, build_docker_cuda, build_docker_rocm, \
test_cli_cpu_pytorch, test_cli_rocm_pytorch, \
test_cli_cpu_neural_compressor, test_cli_cpu_onnxruntime, test_cli_cpu_openvino, \
test_api_cpu, test_api_cuda, test_api_rocm, test_api_misc
.PHONY: quality style install build_docker_cpu build_docker_cuda build_docker_rocm test_cli_cpu_neural_compressor test_cli_cpu_onnxruntime test_cli_cpu_openvino test_cli_cpu_pytorch test_cli_rocm_pytorch test_cli_cuda_pytorch test_api_cpu test_api_cuda test_api_rocm test_api_misc

quality:
ruff check .
@@ -28,6 +24,7 @@ build_docker_rocm:
test_cli_cpu_neural_compressor:
docker run \
--rm \
--pid=host \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
--workdir /workspace \
@@ -36,6 +33,7 @@ test_cli_cpu_neural_compressor:
test_cli_cpu_onnxruntime:
docker run \
--rm \
--pid=host \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
--workdir /workspace \
@@ -44,6 +42,7 @@ test_cli_cpu_onnxruntime:
test_cli_cpu_openvino:
docker run \
--rm \
--pid=host \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
--workdir /workspace \
@@ -52,6 +51,7 @@ test_cli_cpu_openvino:
test_cli_cpu_pytorch:
docker run \
--rm \
--pid=host \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
--workdir /workspace \
@@ -60,6 +60,7 @@ test_cli_cpu_pytorch:
test_cli_rocm_pytorch:
docker run \
--rm \
--pid=host \
--device=/dev/kfd \
--device /dev/dri/renderD128 \
--device /dev/dri/renderD129 \
@@ -72,6 +73,7 @@ test_cli_rocm_pytorch:
test_cli_cuda_pytorch:
docker run \
--rm \
--pid=host \
--gpus '"device=0,1"' \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
@@ -81,6 +83,7 @@ test_cli_cuda_pytorch:
test_api_cpu:
docker run \
--rm \
--pid=host \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
--workdir /workspace \
@@ -89,6 +92,7 @@ test_api_cpu:
test_api_cuda:
docker run \
--rm \
--pid=host \
--gpus '"device=0,1"' \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
@@ -98,6 +102,7 @@ test_api_cuda:
test_api_rocm:
docker run \
--rm \
--pid=host \
--device=/dev/kfd \
--device /dev/dri/renderD128 \
--device /dev/dri/renderD129 \
@@ -110,6 +115,7 @@ test_api_rocm:
test_api_misc:
docker run \
--rm \
--pid=host \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
--workdir /workspace \
4 changes: 2 additions & 2 deletions optimum_benchmark/backends/base.py
@@ -10,7 +10,7 @@

from ..task_utils import get_automodel_class_for_task
from .config import BackendConfigT
from .diffusers_utils import extract_diffusers_shapes_from_config, get_diffusers_pretrained_config
from .diffusers_utils import extract_diffusers_shapes_from_model, get_diffusers_pretrained_config
from .timm_utils import extract_timm_shapes_from_config, get_timm_pre_processor, get_timm_pretrained_config
from .transformers_utils import (
PretrainedProcessor,
@@ -41,7 +41,7 @@ def __init__(self, config: BackendConfigT):

if self.config.library == "diffusers":
self.pretrained_config = get_diffusers_pretrained_config(self.config.model, **self.config.hub_kwargs)
self.model_shapes = extract_diffusers_shapes_from_config(self.config.model, **self.config.hub_kwargs)
self.model_shapes = extract_diffusers_shapes_from_model(self.config.model, **self.config.hub_kwargs)
self.model_type = self.config.task
self.generation_config = None
self.pre_processor = None
2 changes: 1 addition & 1 deletion optimum_benchmark/backends/diffusers_utils.py
@@ -12,7 +12,7 @@ def get_diffusers_pretrained_config(model: str, **kwargs) -> Dict[str, int]:
return diffusers.DiffusionPipeline.load_config(model, **kwargs)


def extract_diffusers_shapes_from_config(model: str, **kwargs) -> Dict[str, int]:
def extract_diffusers_shapes_from_model(model: str, **kwargs) -> Dict[str, int]:
config = diffusers.DiffusionPipeline.load_config(model, **kwargs)

shapes = {}
3 changes: 2 additions & 1 deletion optimum_benchmark/backends/onnxruntime/backend.py
@@ -332,13 +332,14 @@ def prepare_for_inference(self, **kwargs) -> None:

def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
if self.config.library == "diffusers":
return {"prompt": inputs["prompt"]}
return inputs

LOGGER.info(f"\t+ Moving inputs tensors to device {self.config.device}")
for key, value in list(inputs.items()):
if key in self.inputs_names:
inputs[key] = value.to(self.config.device)
else:
LOGGER.warning(f"Input {key} is not in expected inputs names. Removing it.")
inputs.pop(key)

return inputs
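
The filtering above is the core of this fix: inputs that the exported ONNX model does not declare are dropped with a warning instead of being passed to the session. A minimal standalone sketch of the same logic (the `inputs_names` set and the tensors below are illustrative; the real backend reads the names from the ORT session):

```python
# standalone sketch of the input-filtering behaviour above; inputs_names and the
# tensors are illustrative -- the real backend reads the names from the ORT session
import logging

import torch

LOGGER = logging.getLogger("example")


def filter_inputs(inputs: dict, inputs_names: set, device: str) -> dict:
    # keep only tensors the model declares as inputs, moved to the target device;
    # drop anything else with a warning instead of letting the session fail
    for key, value in list(inputs.items()):
        if key in inputs_names:
            inputs[key] = value.to(device)
        else:
            LOGGER.warning(f"Input {key} is not in expected inputs names. Removing it.")
            inputs.pop(key)
    return inputs


inputs = {
    "input_ids": torch.ones(1, 8, dtype=torch.long),
    "token_type_ids": torch.zeros(1, 8, dtype=torch.long),
}
filtered = filter_inputs(inputs, inputs_names={"input_ids", "attention_mask"}, device="cpu")
print(filtered.keys())  # dict_keys(['input_ids']) -- token_type_ids was removed
```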
2 changes: 1 addition & 1 deletion optimum_benchmark/backends/transformers_utils.py
@@ -42,7 +42,7 @@ def get_transformers_pre_processor(model: str, **kwargs) -> Optional["Pretrained
try:
# sometimes contains information about the model's input shapes that are not available in the config
return AutoProcessor.from_pretrained(model, **kwargs)
except ValueError:
except Exception:
return None


54 changes: 28 additions & 26 deletions optimum_benchmark/benchmarks/inference/benchmark.py
@@ -11,6 +11,7 @@
from ..base import Benchmark
from ..report import BenchmarkMeasurements, BenchmarkReport
from .config import InferenceConfig
from .inputs_utils import extract_text_generation_inputs

if is_torch_distributed_available():
import torch.distributed
@@ -72,37 +73,35 @@ def run(self, backend: Backend[BackendConfigT]) -> None:
"The batch size must be divisible by the number of processes in a distributed environment"
)
self.config.input_shapes["batch_size"] //= torch.distributed.get_world_size()
if backend.config.device == "cuda" and backend.config.task in TEXT_GENERATION_TASKS:
TEXT_GENERATION_KWARGS["synced_gpus"] = True

LOGGER.info("\t+ Creating input generator")
self.input_generator = InputGenerator(
task=backend.config.task, model_shapes=backend.model_shapes, input_shapes=self.config.input_shapes
)

if backend.config.task in TEXT_GENERATION_TASKS:
LOGGER.info("\t+ Generating and preparing Text Generation input")
self.forward_inputs = self.input_generator(mode="forward")
self.generate_input = self.input_generator(mode="generate")
LOGGER.info("\t+ Generating and preparing Text Generation inputs")
self.forward_inputs = self.input_generator()
self.forward_inputs = backend.prepare_inputs(self.forward_inputs)
self.generate_input = backend.prepare_inputs(self.generate_input)
self.generate_inputs = extract_text_generation_inputs(self.forward_inputs)
LOGGER.info("\t+ Updating Text Generation kwargs with default values")
self.config.generate_kwargs = {**TEXT_GENERATION_KWARGS, **self.config.generate_kwargs}
LOGGER.info("\t+ Initializing Text Generation report")
self.report = TextGenerationReport(prefill=BenchmarkMeasurements(), decode=BenchmarkMeasurements())

elif backend.config.task in IMAGE_DIFFUSION_TASKS:
LOGGER.info("\t+ Generating and preparing Image Diffusion input")
self.diffuse_input = self.input_generator(mode="call")
self.diffuse_input = backend.prepare_inputs(self.diffuse_input)
LOGGER.info("\t+ Generating Image Diffusion inputs")
self.call_inputs = self.input_generator()
self.call_inputs = backend.prepare_inputs(self.call_inputs)
self.call_inputs = {"prompt": self.call_inputs["prompt"]}
LOGGER.info("\t+ Updating Image Diffusion kwargs with default values")
self.config.forward_kwargs = {**IMAGE_DIFFUSION_KWARGS, **self.config.forward_kwargs}
self.config.call_kwargs = {**IMAGE_DIFFUSION_KWARGS, **self.config.call_kwargs}
LOGGER.info("\t+ Initializing Image Diffusion report")
self.report = ImageDiffusionReport(call=BenchmarkMeasurements())

else:
LOGGER.info("\t+ Generating and preparing Inference input")
self.forward_inputs = self.input_generator(mode="forward")
LOGGER.info("\t+ Generating and preparing Inference inputs")
self.forward_inputs = self.input_generator()
self.forward_inputs = backend.prepare_inputs(self.forward_inputs)
LOGGER.info("\t+ Initializing Inference report")
self.report = InferenceReport(forward=BenchmarkMeasurements())
@@ -111,16 +110,17 @@ def run(self, backend: Backend[BackendConfigT]) -> None:
backend.prepare_for_inference(
**backend.model_shapes,
**self.config.input_shapes,
**self.config.forward_kwargs,
**self.config.generate_kwargs,
**self.config.forward_kwargs,
**self.config.call_kwargs,
)

LOGGER.info("\t+ Warming up backend for Inference")
for _ in range(self.config.warmup_runs):
if backend.config.task in TEXT_GENERATION_TASKS:
_ = backend.generate(self.generate_input, {"max_new_tokens": 2, "min_new_tokens": 2})
_ = backend.generate(self.generate_inputs, {"max_new_tokens": 2, "min_new_tokens": 2})
elif backend.config.task in IMAGE_DIFFUSION_TASKS:
_ = backend.call(self.diffuse_input, {"num_inference_steps": 2})
_ = backend.call(self.call_inputs, {"num_inference_steps": 2})
else:
_ = backend.forward(self.forward_inputs, self.config.forward_kwargs)

@@ -164,8 +164,6 @@ def run(self, backend: Backend[BackendConfigT]) -> None:
self.report.log_energy()
self.report.log_efficiency()

self.report.log()

## Memory tracking
def run_text_generation_memory_tracking(self, backend: Backend):
LOGGER.info("\t+ Running memory tracking")
@@ -177,15 +175,15 @@ def run_text_generation_memory_tracking(self, backend: Backend):

self.memory_tracker.reset()
with self.memory_tracker.track():
_ = backend.generate(self.generate_input, self.config.generate_kwargs)
_ = backend.generate(self.generate_inputs, self.config.generate_kwargs)

self.report.decode.memory = self.memory_tracker.get_max_memory()

def run_image_diffusion_memory_tracking(self, backend: Backend):
LOGGER.info("\t+ Running memory tracking")
self.memory_tracker.reset()
with self.memory_tracker.track():
_ = backend.call(self.diffuse_input, self.config.forward_kwargs)
_ = backend.call(self.call_inputs, self.config.call_kwargs)

self.report.call.memory = self.memory_tracker.get_max_memory()

@@ -205,17 +203,21 @@ def run_text_generation_latency_tracking(self, backend: Backend):
with self.latency_tracker.track():
_ = backend.forward(self.forward_inputs, self.config.forward_kwargs)

self.report.prefill.latency = self.latency_tracker.get_latency()
forward_latency = self.latency_tracker.get_latency()
forward_latency.log(prefix="forward")
self.report.prefill.latency = forward_latency
self.report.prefill.throughput = self.latency_tracker.get_throughput(
volume=self.prefill_volume, unit=PREFILL_THROUGHPUT_UNIT
)

self.latency_tracker.reset()
while self.latency_tracker.get_elapsed_time() < self.config.duration:
with self.latency_tracker.track():
_ = backend.generate(self.generate_input, self.config.generate_kwargs)
_ = backend.generate(self.generate_inputs, self.config.generate_kwargs)

self.report.decode.latency = self.latency_tracker.get_latency() - self.report.prefill.latency.mean
generate_latency = self.latency_tracker.get_latency()
generate_latency.log(prefix="generate")
self.report.decode.latency = generate_latency - self.report.prefill.latency.mean
self.report.decode.throughput = Throughput.from_latency(
self.report.decode.latency, self.decode_volume, unit=DECODE_THROUGHPUT_UNIT
)
@@ -225,7 +227,7 @@ def run_image_diffusion_latency_tracking(self, backend: Backend):
self.latency_tracker.reset()
while self.latency_tracker.get_elapsed_time() < self.config.duration:
with self.latency_tracker.track():
_ = backend.call(self.diffuse_input, self.config.forward_kwargs)
_ = backend.call(self.call_inputs, self.config.call_kwargs)

self.report.call.latency = self.latency_tracker.get_latency()
self.report.call.throughput = Throughput.from_latency(
@@ -258,7 +260,7 @@ def run_text_generation_energy_tracking(self, backend: Backend):

self.energy_tracker.reset()
with self.energy_tracker.track():
_ = backend.generate(self.generate_input, self.config.generate_kwargs)
_ = backend.generate(self.generate_inputs, self.config.generate_kwargs)

self.report.decode.energy = self.energy_tracker.get_energy() - self.report.prefill.energy
self.report.decode.efficiency = Efficiency.from_energy(
@@ -269,7 +271,7 @@ def run_image_diffusion_energy_tracking(self, backend: Backend):
LOGGER.info("\t+ Running energy tracking")
self.energy_tracker.reset()
with self.energy_tracker.track():
_ = backend.call(self.diffuse_input, self.config.forward_kwargs)
_ = backend.call(self.call_inputs, self.config.call_kwargs)

self.report.call.energy = self.energy_tracker.get_energy()
self.report.call.efficiency = Efficiency.from_energy(
@@ -297,7 +299,7 @@ def prefill_volume(self) -> int: # in tokens

@property
def call_volume(self) -> int: # in images
return self.config.input_shapes["batch_size"] * self.config.forward_kwargs["num_images_per_prompt"]
return self.config.input_shapes["batch_size"] * self.config.call_kwargs["num_images_per_prompt"]

@property
def decode_volume(self) -> int: # in tokens
17 changes: 17 additions & 0 deletions optimum_benchmark/benchmarks/inference/inputs_utils.py
@@ -0,0 +1,17 @@
def extract_text_generation_inputs(inputs):
if "pixel_values" in inputs:
# image input
text_generation_inputs = {"inputs": inputs["pixel_values"]}
elif "input_values" in inputs:
# speech input
text_generation_inputs = {"inputs": inputs["input_values"]}
elif "input_features" in inputs:
# waveform input
text_generation_inputs = {"inputs": inputs["input_features"]}
elif "input_ids" in inputs:
# text input
text_generation_inputs = {"inputs": inputs["input_ids"]}
else:
raise ValueError("Could not find any valid text generation inputs.")

return text_generation_inputs
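
A quick usage illustration of the new helper (assuming optimum-benchmark from this branch is installed; the tensor values are placeholders):

```python
# illustrative usage of extract_text_generation_inputs; tensor values are placeholders
import torch

from optimum_benchmark.benchmarks.inference.inputs_utils import extract_text_generation_inputs

forward_inputs = {
    "input_ids": torch.randint(0, 100, (2, 16)),
    "attention_mask": torch.ones(2, 16, dtype=torch.long),
}
generate_inputs = extract_text_generation_inputs(forward_inputs)
print(generate_inputs.keys())  # dict_keys(['inputs']) -- only the prompt tensor is kept for generate()
```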
19 changes: 1 addition & 18 deletions optimum_benchmark/generators/input_generator.py
@@ -22,23 +22,6 @@ def __init__(self, task: str, input_shapes: Dict[str, int], model_shapes: Dict[s
"please submit a PR or a feature request to optimum-benchmark. "
)

def __call__(self, mode: str) -> Dict[str, Any]:
def __call__(self) -> Dict[str, Any]:
task_input = self.task_generator()

if mode == "generate":
if "pixel_values" in task_input:
# image input
task_input = {"inputs": task_input["pixel_values"]}
elif "input_values" in task_input:
# speech input
task_input = {"inputs": task_input["input_values"]}
elif "input_features" in task_input:
# waveform input
task_input = {"inputs": task_input["input_features"]}
elif "input_ids" in task_input:
# text input
task_input = {"inputs": task_input["input_ids"]}
elif mode == "call":
task_input = {"prompt": task_input["prompt"]}

return task_input
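
For context, a hedged sketch of how the simplified generator is now driven together with the new helper (the `mode` argument is gone; the shape values are illustrative and the exact keys each task generator expects may differ):

```python
# sketch only: generate inputs are now derived from the forward inputs afterwards;
# the shape dictionaries are illustrative, not the exact keys every task requires
from optimum_benchmark.benchmarks.inference.inputs_utils import extract_text_generation_inputs
from optimum_benchmark.generators.input_generator import InputGenerator

generator = InputGenerator(
    task="text-generation",
    input_shapes={"batch_size": 2, "sequence_length": 16},
    model_shapes={"vocab_size": 32000},
)
forward_inputs = generator()  # raw task inputs, used for forward()
generate_inputs = extract_text_generation_inputs(forward_inputs)  # {"inputs": input_ids}, used for generate()
```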
14 changes: 4 additions & 10 deletions optimum_benchmark/trackers/latency.py
@@ -140,14 +140,13 @@ def _cpu_latency(self):
self.end_events.append(end)

def get_elapsed_time(self) -> float:
# we measured in cpu to not synchronize all events
# we measure it in cpu to not synchronize all events
return time.perf_counter() - self.start_time

def get_latency(self) -> Latency:
if self.backend == "pytorch" and self.device == "cuda":
# synchronize the last event to make sure it has been recorded
self.start_events[-1].synchronize()
self.end_events[-1].synchronize()
# synchronize the device to make sure all events have been recorded
torch.cuda.synchronize()

latencies_list = [
self.start_events[i].elapsed_time(self.end_events[i]) / 1e3 for i in range(len(self.start_events))
@@ -210,12 +209,7 @@ def __init__(self, device: str, backend: str):
self.reset()

def reset(self):
if self.device == "cuda" and self.backend == "pytorch":
event = torch.cuda.Event(enable_timing=True)
event.record()
self.events = [event]
else:
self.events = [time.perf_counter()]
self.events: List[Union[float, torch.cuda.Event]] = []

def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
if self.device == "cuda" and self.backend == "pytorch":
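
As background for the `get_latency` change above, a minimal sketch of CUDA-event timing where a single device-wide synchronize replaces synchronizing individual events (illustrative only; falls back to `perf_counter` when no GPU is available):

```python
# minimal sketch of event-based latency timing with one device-wide synchronize,
# mirroring the get_latency change above; CPU fallback uses perf_counter
import time

import torch


def time_fn(fn, runs: int = 3, device: str = "cuda"):
    if device == "cuda" and torch.cuda.is_available():
        starts = [torch.cuda.Event(enable_timing=True) for _ in range(runs)]
        ends = [torch.cuda.Event(enable_timing=True) for _ in range(runs)]
        for start, end in zip(starts, ends):
            start.record()
            fn()
            end.record()
        torch.cuda.synchronize()  # one synchronize for all recorded events
        return [start.elapsed_time(end) / 1e3 for start, end in zip(starts, ends)]  # seconds
    latencies = []
    for _ in range(runs):
        t0 = time.perf_counter()
        fn()
        latencies.append(time.perf_counter() - t0)
    return latencies


print(time_fn(lambda: torch.ones(256, 256) @ torch.ones(256, 256), device="cpu"))
```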