
Commit

Add buffer in the maximum number of tokens generated (to fix #353) (#354)

* Add buffer in the maximum number of tokens generated

* Add the token_buffer consistently in all subclasses
viswavi authored Sep 20, 2023
1 parent c39b68a commit b01a7f8
Showing 2 changed files with 18 additions and 4 deletions.
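In short, the patch subtracts a token_buffer (default 300) from the model's token limit when computing how long a completion may be, and lowers the fallback used when no limit is known from 4x to 3x the prompt length. A minimal sketch of the resulting budget arithmetic follows; the 4096-token limit and 1000-token prompt are assumed purely for illustration and are not part of this commit.

# Sketch of the new completion-budget arithmetic (illustrative, not the repo's code).
def completion_budget(num_prompt_tokens, max_tokens=None, token_buffer=300):
    if max_tokens:
        # Keep token_buffer tokens of headroom in case our local token count
        # differs slightly from the API service's own count.
        return max_tokens - num_prompt_tokens - token_buffer
    # No known limit: fall back to a multiple of the prompt length.
    return 3 * num_prompt_tokens

print(completion_budget(1000, 4096))  # 4096 - 1000 - 300 = 2796
print(completion_budget(1000))        # 3 * 1000 = 3000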
20 changes: 16 additions & 4 deletions prompt2model/utils/api_tools.py
@@ -72,6 +72,7 @@ def generate_one_completion(
temperature: float = 0,
presence_penalty: float = 0,
frequency_penalty: float = 0,
+ token_buffer: int = 300,
) -> openai.Completion:
"""Generate a chat completion using an API-based model.
@@ -86,16 +87,21 @@ def generate_one_completion(
frequency_penalty: Float between -2.0 and 2.0. Positive values penalize new
tokens based on their existing frequency in the text so far, decreasing
the model's likelihood of repeating the same line verbatim.
+ token_buffer: Number of tokens below the LLM's limit to generate. In case
+ our tokenizer does not exactly match the LLM API service's perceived
+ number of tokens, this prevents service errors. On the other hand, this
+ may lead to generating fewer tokens in the completion than is actually
+ possible.
Returns:
An OpenAI-like response object if there were no errors in generation.
In case of an API-specific error, the Exception object is captured and returned.
"""
num_prompt_tokens = count_tokens_from_string(prompt)
if self.max_tokens:
- max_tokens = self.max_tokens - num_prompt_tokens
+ max_tokens = self.max_tokens - num_prompt_tokens - token_buffer
else:
- max_tokens = 4 * num_prompt_tokens
+ max_tokens = 3 * num_prompt_tokens

response = completion( # completion gets the key from os.getenv
model=self.model_name,
@@ -116,6 +122,7 @@ async def generate_batch_completion(
temperature: float = 1,
responses_per_request: int = 5,
requests_per_minute: int = 80,
+ token_buffer: int = 300,
) -> list[openai.Completion]:
"""Generate a batch responses from OpenAI Chat Completion API.
@@ -126,6 +133,11 @@
responses_per_request: Number of responses for each request.
i.e. the parameter n of API call.
requests_per_minute: Number of requests per minute to allow.
+ token_buffer: Number of tokens below the LLM's limit to generate. In case
+ our tokenizer does not exactly match the LLM API service's perceived
+ number of tokens, this prevents service errors. On the other hand, this
+ may lead to generating fewer tokens in the completion than is actually
+ possible.
Returns:
List of generated responses.
@@ -183,9 +195,9 @@ async def _throttled_completion_acreate(

num_prompt_tokens = max(count_tokens_from_string(prompt) for prompt in prompts)
if self.max_tokens:
- max_tokens = self.max_tokens - num_prompt_tokens
+ max_tokens = self.max_tokens - num_prompt_tokens - token_buffer
else:
- max_tokens = 4 * num_prompt_tokens
+ max_tokens = 3 * num_prompt_tokens

async_responses = [
_throttled_completion_acreate(
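Both docstrings above describe the same trade-off: a larger token_buffer makes it less likely that the service rejects a request because its token count differs from ours, at the cost of a shorter maximum completion. A usage sketch of the updated signatures follows; it assumes these methods live on prompt2model's APIAgent class and that its constructor accepts model_name and max_tokens, neither of which is shown in this diff.

# Usage sketch for the new parameter. APIAgent and its constructor arguments
# are assumed from the surrounding module and may differ from the real code;
# the method signatures themselves come from the diff above.
import asyncio

from prompt2model.utils.api_tools import APIAgent

agent = APIAgent(model_name="gpt-3.5-turbo", max_tokens=4096)

# A larger buffer trades completion length for robustness to token-count drift.
response = agent.generate_one_completion(
    "Summarize the change in one sentence.",
    temperature=0,
    token_buffer=500,
)

responses = asyncio.run(
    agent.generate_batch_completion(
        ["Prompt A", "Prompt B"],
        temperature=1,
        responses_per_request=5,
        token_buffer=300,
    )
)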
2 changes: 2 additions & 0 deletions test_helpers/mock_api.py
@@ -196,6 +196,7 @@ def generate_one_completion(
temperature: float = 0,
presence_penalty: float = 0,
frequency_penalty: float = 0,
+ token_buffer: int = 300,
) -> openai.Completion:
"""Return a mocked object and increment the counter."""
self.generate_one_call_counter += 1
@@ -207,6 +208,7 @@ async def generate_batch_completion(
temperature: float = 1,
responses_per_request: int = 5,
requests_per_minute: int = 80,
+ token_buffer: int = 300,
) -> list[openai.Completion]:
"""Return a mocked object and increment the counter."""
self.generate_batch_call_counter += 1
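The mock in test_helpers/mock_api.py only needs its signatures to keep accepting the new keyword so that existing tests continue to match the real interface; the counters shown above are unchanged. A minimal compatibility check as a sketch; the MockAPIAgent class name and its no-argument constructor are assumptions, and only the counter attribute comes from the diff.

# Sketch of a signature-compatibility check for the mock. The class name
# MockAPIAgent and its no-argument constructor are assumed and may differ
# from test_helpers/mock_api.py; the counter attribute is from the diff.
from test_helpers.mock_api import MockAPIAgent

mock_agent = MockAPIAgent()
mock_agent.generate_one_completion("hello", token_buffer=300)
assert mock_agent.generate_one_call_counter == 1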
