Skip to content

Commit

Permalink
Feat: reimplement vllm backend beam search using logprobs (#293)
Browse files Browse the repository at this point in the history
Co-authored-by: Ilyas Moutawwakil <[email protected]>
  • Loading branch information
vicoooo26 and IlyasMoutawwakil authored Nov 25, 2024
1 parent 31aa662 commit 1c20082
Showing 1 changed file with 18 additions and 20 deletions.
38 changes: 18 additions & 20 deletions optimum_benchmark/backends/vllm/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,35 +117,33 @@ def batch_offline_engine_generate(self, inputs: Dict[str, Any], kwargs: Dict[str
self.pretrained_model.add_request(
inputs=prompt,
request_id=str(i),
params=SamplingParams(
ignore_eos=True,
detokenize=True,
seed=self.config.seed,
n=kwargs.get("num_return_sequences"),
max_tokens=kwargs.get("max_new_tokens"),
min_tokens=kwargs.get("min_new_tokens"),
use_beam_search=kwargs.get("num_beams") > 1,
logits_processors=kwargs.get("logits_processors", None),
),
params=self.get_sampling_params(kwargs),
)

while self.pretrained_model.has_unfinished_requests():
self.pretrained_model.step()

def get_sampling_params(self, kwargs: Dict[str, Any]) -> SamplingParams:
    """Build the vLLM ``SamplingParams`` shared by the offline and online engines.

    Args:
        kwargs: Generation keyword arguments. The keys read here are
            ``num_return_sequences``, ``max_new_tokens``, ``min_new_tokens``,
            ``logits_processors`` and ``num_beams``; each is looked up with
            ``dict.get`` so a missing key yields ``None``.

    Returns:
        A ``SamplingParams`` seeded with ``self.config.seed`` that ignores EOS
        and detokenizes output. When ``num_beams`` is greater than 1, its
        ``logprobs`` attribute is additionally set to ``2 * num_beams``.
    """
    # A missing or None `num_beams` means "no beam search"; comparing None
    # with an int would otherwise raise TypeError.
    num_beams = kwargs.get("num_beams") or 1

    params = SamplingParams(
        ignore_eos=True,
        detokenize=True,
        seed=self.config.seed,
        n=kwargs.get("num_return_sequences"),
        max_tokens=kwargs.get("max_new_tokens"),
        min_tokens=kwargs.get("min_new_tokens"),
        logits_processors=kwargs.get("logits_processors", None),
    )
    # following huggingface transformers implementation
    # https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/generation/beam_search.py#L534
    # Beam search is emulated by requesting 2 * num_beams logprobs
    # (the `use_beam_search` flag is no longer passed to SamplingParams).
    if num_beams > 1:
        params.logprobs = 2 * num_beams
    return params

async def single_online_engine_generate(self, prompt: str, request_id: str, kwargs: Dict[str, Any]) -> Any:
stream = await self.pretrained_model.add_request(
inputs=prompt,
request_id=request_id,
params=SamplingParams(
ignore_eos=True,
detokenize=True,
seed=self.config.seed,
n=kwargs.get("num_return_sequences"),
max_tokens=kwargs.get("max_new_tokens"),
min_tokens=kwargs.get("min_new_tokens"),
use_beam_search=kwargs.get("num_beams") > 1,
logits_processors=kwargs.get("logits_processors", None),
),
params=self.get_sampling_params(kwargs),
)

async for _ in stream:
Expand Down

0 comments on commit 1c20082

Please sign in to comment.