diff --git a/evals/elsuite/bluff/strategy_solver.py b/evals/elsuite/bluff/strategy_solver.py index 0623b9727f..7a2029d303 100644 --- a/evals/elsuite/bluff/strategy_solver.py +++ b/evals/elsuite/bluff/strategy_solver.py @@ -1,9 +1,11 @@ +import copy import re from importlib import import_module from typing import Optional from evals.elsuite.bluff.bluff.cards import get_bluff_move from evals.solvers.solver import Solver, SolverResult +from evals.solvers.utils import PersistentMemoryCache from evals.task_state import Message, TaskState @@ -24,7 +26,23 @@ def __init__( self.max_attempts = max_attempts self.rethink_strategy_after = rethink_strategy_after - def __call__(self, task_state: TaskState): + # interaction_length=1 to store reasoning step in private memory + self.interaction_cache = PersistentMemoryCache(interaction_length=1) + + def _generate_response(self, task_state: TaskState): + """ + Calls base solver. Modifies taks state to remove all non-reasoning messages + from assistant + """ + task_state = copy.deepcopy(task_state) + task_state.messages = [ + msg + for msg in task_state.messages + if msg.role != "assistant" or msg.content.startswith("{") or len(msg.content) > 20 + ] + return self.base_solver(task_state).output + + def _solve(self, task_state: TaskState): """ This solver does three things that should help the model play better: 1. Adds a strategy guide as the first message (just after the task description) @@ -35,25 +53,12 @@ def __call__(self, task_state: TaskState): # GENERAL NOTE. # This function is pretty ugly. I'm not sure how to implement this better. We decided this is good enough. - # Remove assistant messages added by the main solver (i.e. non-JSON). - # We need len(msg.content) > 20 because we don't want to remove "rething startegy". 
- task_state.messages = [ - msg - for msg in task_state.messages - if msg.role != "assistant" or msg.content.startswith("{") or len(msg.content) > 20 - ] + # Before the first move in a game - strategy guide goes first + strategy_msg = Message("system", strategy) + task_state.messages.insert(0, strategy_msg) + task_state.messages = self.interaction_cache.load_private_interaction(task_state) game = task_state.current_state - - if len(game.rounds) == 1 and len(game.rounds[0].moves) < 2: - # Before the first move in a game - strategy guide goes first - strategy_msg = Message("system", strategy) - - # This if is important - we might have already tried - # to bid, but gave an invalid bid, so still we have no moves - if strategy_msg not in task_state.messages: - task_state.messages.insert(0, strategy_msg) - if ( self.rethink_strategy_after is not None and len(game.rounds) == 1 + self.rethink_strategy_after @@ -67,15 +72,32 @@ def __call__(self, task_state: TaskState): if strategy_update_msg not in task_state.messages: last_system_message = task_state.messages.pop() task_state.messages.append(strategy_update_msg) - response = self.base_solver(task_state).output + response = self._generate_response(task_state) task_state.messages.append(Message("assistant", response)) task_state.messages.append(last_system_message) + # Manually update interaction cache, since we re-order messages + last_interaction = self.interaction_cache.last_interaction + last_interaction_messages = last_interaction.messages[:-1] + [ + Message("system", strategy_update_msg), + Message("assistant", response), + Message("system", last_system_message), + ] + last_interaction_private_ids = last_interaction.private_messages_ids + [ + len(task_state.messages) - 3, + len(task_state.messages) - 2, + ] + + self.interaction_cache.last_interaction.messages = last_interaction_messages + self.interaction_cache.last_interaction.private_messages_ids = ( + last_interaction_private_ids + ) + # If this move_str is preserved, the game engine will have to deal with that # (and it has some way of solving this problem) move_str = "[INVALID MOVE]" for _ in range(self.max_attempts): - response = self.base_solver(task_state).output + response = self._generate_response(task_state) try: move_str = self._parse_response(response) # This will raise ValueError if this is not a valid move @@ -85,6 +107,9 @@ def __call__(self, task_state: TaskState): pass task_state.messages.append(Message("assistant", response)) + task_state.messages.append(Message("assistant", move_str)) + self.interaction_cache.save_private_interaction(task_state) + return SolverResult(move_str) @property diff --git a/evals/elsuite/make_me_pay/solvers/lm_con_artist_solver.py b/evals/elsuite/make_me_pay/solvers/lm_con_artist_solver.py index 2daf67472e..0f3c6c42b8 100644 --- a/evals/elsuite/make_me_pay/solvers/lm_con_artist_solver.py +++ b/evals/elsuite/make_me_pay/solvers/lm_con_artist_solver.py @@ -6,6 +6,7 @@ EXTRACT_MESSAGE, TIME_REMINDER_MSG, ) +from evals.solvers.nested.cot_solver import CoTSolver from evals.solvers.solver import Solver, SolverResult from evals.task_state import Message, TaskState @@ -28,7 +29,7 @@ def __init__( # Add more args to base solver if we're using CoT base_solver_args_extra = {} - if class_name == "OpenAIChatCompletionCoTSolver": + if issubclass(cls, CoTSolver): base_solver_args_extra = { "cot_template": COT_MESSAGE, "extract_template": EXTRACT_MESSAGE, @@ -49,7 +50,7 @@ def __init__( def name(self): return "Scaffolded-LM-Solver" - def __call__(self, 
task_state: TaskState, **kwargs) -> SolverResult: + def _solve(self, task_state: TaskState, **kwargs) -> SolverResult: # Optional additional message for better LM capabilities. Only append if # this is start of conversaiton, otherwise this is included in memory if self.lm_system_prompt: diff --git a/evals/elsuite/sandbagging/solvers.py b/evals/elsuite/sandbagging/solvers.py index 21099238b8..8ce27bc3f3 100644 --- a/evals/elsuite/sandbagging/solvers.py +++ b/evals/elsuite/sandbagging/solvers.py @@ -126,7 +126,7 @@ def _construct_prompt(self, task_state: TaskState) -> Sequence[Dict]: return prompt - def __call__(self, task_state: TaskState, **kwargs) -> (Sequence[Dict], SolverResult): + def _solve(self, task_state: TaskState, **kwargs) -> (Sequence[Dict], SolverResult): prompt = self._construct_prompt(task_state) result = self._predict_answer(prompt, **kwargs) diff --git a/evals/elsuite/self_prompting/solvers/baselines.py b/evals/elsuite/self_prompting/solvers/baselines.py index 5aea250905..4710088494 100644 --- a/evals/elsuite/self_prompting/solvers/baselines.py +++ b/evals/elsuite/self_prompting/solvers/baselines.py @@ -11,7 +11,7 @@ def __init__( This solver simply returns an empty string as the prompt. """ - def __call__( + def _solve( self, task_state: TaskState, **kwargs, @@ -32,7 +32,7 @@ def __init__( This solver simply returns the original instruction as the prompt. """ - def __call__( + def _solve( self, task_state: TaskState, **kwargs, @@ -54,7 +54,7 @@ def __init__( This solver concatenates the given input-output examples as few-shot demonstrations. """ - def __call__( + def _solve( self, task_state: TaskState, **kwargs, diff --git a/evals/elsuite/self_prompting/solvers/custom_cot_solver.py b/evals/elsuite/self_prompting/solvers/custom_cot_solver.py index c75146518f..465f7f5e4d 100644 --- a/evals/elsuite/self_prompting/solvers/custom_cot_solver.py +++ b/evals/elsuite/self_prompting/solvers/custom_cot_solver.py @@ -36,7 +36,7 @@ def __init__( self.extract_completion_fn = OpenAIChatCompletionFn(**self.completion_fn_options) self.extract_template = extract_template - def __call__( + def _solve( self, task_state: TaskState, **kwargs, diff --git a/evals/registry.py b/evals/registry.py index 89557c1cf0..cb37791cbc 100644 --- a/evals/registry.py +++ b/evals/registry.py @@ -60,6 +60,7 @@ def n_ctx_from_model_name(model_name: str) -> Optional[int]: "gpt-4": 8192, "gpt-4-32k": 32768, "gpt-4-base": 8192, + "gpt-4-1106-preview": 128_000, } # first, look for an exact match @@ -135,7 +136,7 @@ def make_completion_fn( # No match, so try to find a completion-fn-id in the registry spec = self.get_completion_fn(name) if spec is None: - raise ValueError(f"Could not find CompletionFn in the registry with ID {name}") + raise ValueError(f"Could not find CompletionFn/Solver in the registry with ID {name}") if spec.args is None: spec.args = {} spec.args.update(kwargs) @@ -195,7 +196,7 @@ def get_modelgraded_spec(self, name: str, **kwargs: dict) -> Optional[ModelGrade ) def get_completion_fn(self, name: str) -> Optional[CompletionFnSpec]: - return self._dereference(name, self._completion_fns, "completion_fn", CompletionFnSpec) + return self._dereference(name, self._completion_fns | self._solvers, "completion_fn", CompletionFnSpec) def get_eval(self, name: str) -> Optional[EvalSpec]: return self._dereference(name, self._evals, "eval", EvalSpec) @@ -303,6 +304,10 @@ def _load_registry(self, registry_paths: Sequence[Path], resource_type: str) -> def _completion_fns(self) -> RawRegistry: return 
self._load_registry(self._registry_paths, "completion_fns") + @functools.cached_property + def _solvers(self) -> RawRegistry: + return self._load_registry(self._registry_paths, "solvers") + @functools.cached_property def _eval_sets(self) -> RawRegistry: return self._load_registry(self._registry_paths, "eval_sets") diff --git a/evals/registry/completion_fns/bluff.yaml b/evals/registry/completion_fns/bluff.yaml deleted file mode 100644 index 6a89d37322..0000000000 --- a/evals/registry/completion_fns/bluff.yaml +++ /dev/null @@ -1,77 +0,0 @@ -# Recommended strategic solvers -bluff/strategy/gpt-4: - class: evals.elsuite.bluff.strategy_solver:BluffStrategySolver - args: - base_solver_class: evals.solvers.openai_chat_completion_solver:OpenAIChatCompletionSolver - base_solver_args: - completion_fn_options: - model: gpt-4 - extra_options: - temperature: 0 - -bluff/strategy/gpt-3.5-turbo-16k: - class: evals.elsuite.bluff.strategy_solver:BluffStrategySolver - args: - base_solver_class: evals.solvers.openai_chat_completion_solver:OpenAIChatCompletionSolver - base_solver_args: - completion_fn_options: - model: gpt-3.5-turbo-16k - extra_options: - temperature: 0 - -bluff/strategy/gpt-4-base: - class: evals.elsuite.bluff.strategy_solver:BluffStrategySolver - args: - base_solver_class: evals.solvers.openai_completion_hhh_solver:OpenAICompletionHHHSolver - base_solver_args: - completion_fn_options: - model: gpt-4-base - extra_options: - temperature: 0 - max_tokens: 512 - -bluff/strategy/code-davinci-002: - class: evals.elsuite.bluff.strategy_solver:BluffStrategySolver - args: - base_solver_class: evals.solvers.openai_completion_hhh_solver:OpenAICompletionHHHSolver - base_solver_args: - completion_fn_options: - model: code-davinci-002 - extra_options: - temperature: 0 - max_tokens: 512 - -# Other solvers. They are just base models, but with temperature set to 0. 
-bluff/gpt-4: - class: evals.solvers.openai_chat_completion_solver:OpenAIChatCompletionSolver - args: - completion_fn_options: - model: gpt-4 - extra_options: - temperature: 0 - -bluff/gpt-3.5-turbo-16k: - class: evals.solvers.openai_chat_completion_solver:OpenAIChatCompletionSolver - args: - completion_fn_options: - model: gpt-3.5-turbo-16k - extra_options: - temperature: 0 - -bluff/gpt-4-base: - class: evals.solvers.openai_completion_hhh_solver:OpenAICompletionHHHSolver - args: - completion_fn_options: - model: gpt-4-base - extra_options: - temperature: 0 - max_tokens: 512 - -bluff/code-davinci-002: - class: evals.solvers.openai_completion_hhh_solver:OpenAICompletionHHHSolver - args: - completion_fn_options: - model: code-davinci-002 - extra_options: - temperature: 0 - max_tokens: 512 diff --git a/evals/registry/completion_fns/make-me-pay.yaml b/evals/registry/completion_fns/make-me-pay.yaml deleted file mode 100644 index a9383100da..0000000000 --- a/evals/registry/completion_fns/make-me-pay.yaml +++ /dev/null @@ -1,92 +0,0 @@ -make-me-pay/baseline/gpt-3.5-turbo-16k: - class: evals.elsuite.make_me_pay.solvers.lm_con_artist_solver:LMConArtistSolver - args: - base_solver_class: evals.solvers.openai_chat_completion_solver:OpenAIChatCompletionSolver - base_solver_args: - completion_fn_options: - model: gpt-3.5-turbo-16k - extra_options: - temperature: 1.0 - max_tokens: 1000 - -make-me-pay/baseline/gpt-4: - class: evals.elsuite.make_me_pay.solvers.lm_con_artist_solver:LMConArtistSolver - args: - base_solver_class: evals.solvers.openai_chat_completion_solver:OpenAIChatCompletionSolver - base_solver_args: - completion_fn_options: - model: gpt-4 - extra_options: - temperature: 1.0 - max_tokens: 1000 - -make-me-pay/baseline/gpt-4-32k: - class: evals.elsuite.make_me_pay.solvers.lm_con_artist_solver:LMConArtistSolver - args: - base_solver_class: evals.solvers.openai_chat_completion_solver:OpenAIChatCompletionSolver - base_solver_args: - completion_fn_options: - model: gpt-4-32k - extra_options: - temperature: 1.0 - max_tokens: 1000 - -make-me-pay/gpt-3.5-turbo-16k: - class: evals.elsuite.make_me_pay.solvers.lm_con_artist_solver:LMConArtistSolver - args: - base_solver_class: evals.solvers.openai_chat_completion_cot_solver:OpenAIChatCompletionCoTSolver - base_solver_args: - cot_options: - model: gpt-3.5-turbo-16k - extra_options: - temperature: 1.0 - max_tokens: 1000 - extract_options: - model: gpt-3.5-turbo-16k - extra_options: - temperature: 1.0 - max_tokens: 1000 - persistent_memory: True - lm_system_prompt: True - time_reminder: True - -make-me-pay/gpt-4: - class: evals.elsuite.make_me_pay.solvers.lm_con_artist_solver:LMConArtistSolver - args: - base_solver_class: evals.solvers.openai_chat_completion_cot_solver:OpenAIChatCompletionCoTSolver - base_solver_args: - cot_options: - model: gpt-4 - extra_options: - temperature: 1.0 - max_tokens: 1000 - extract_options: - model: gpt-4 - extra_options: - temperature: 1.0 - max_tokens: 1000 - persistent_memory: True - lm_system_prompt: True - time_reminder: True - -make-me-pay/gpt-4-32k: - class: evals.elsuite.make_me_pay.solvers.lm_con_artist_solver:LMConArtistSolver - args: - base_solver_class: evals.solvers.openai_chat_completion_cot_solver:OpenAIChatCompletionCoTSolver - base_solver_args: - cot_options: - model: gpt-4-32k - extra_options: - temperature: 1.0 - max_tokens: 1000 - extract_options: - model: gpt-4-32k - extra_options: - temperature: 1.0 - max_tokens: 1000 - persistent_memory: True - lm_system_prompt: True - time_reminder: True - 
-make-me-pay/cli: - class: evals.solvers.human_cli_solver:HumanCliSolver diff --git a/evals/registry/solvers/bluff.yaml b/evals/registry/solvers/bluff.yaml new file mode 100644 index 0000000000..4d8e0e6610 --- /dev/null +++ b/evals/registry/solvers/bluff.yaml @@ -0,0 +1,89 @@ +# Recommended strategic solvers +bluff/strategy/gpt-4: + class: evals.elsuite.bluff.strategy_solver:BluffStrategySolver + args: + base_solver_class: evals.solvers.openai_solver:OpenAISolver + base_solver_args: + completion_fn_options: + model: gpt-4 + extra_options: + temperature: 0 + +bluff/strategy/gpt-3.5-turbo-16k: + class: evals.elsuite.bluff.strategy_solver:BluffStrategySolver + args: + base_solver_class: evals.solvers.openai_solver:OpenAISolver + base_solver_args: + completion_fn_options: + model: gpt-3.5-turbo-16k + extra_options: + temperature: 0 + +bluff/strategy/gpt-4-base: + class: evals.elsuite.bluff.strategy_solver:BluffStrategySolver + args: + base_solver_class: evals.solvers.nested.hhh_solver:HHHSolver + base_solver_args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-base + extra_options: + temperature: 0 + max_tokens: 512 + +bluff/strategy/code-davinci-002: + class: evals.elsuite.bluff.strategy_solver:BluffStrategySolver + args: + base_solver_class: evals.solvers.nested.hhh_solver:HHHSolver + base_solver_args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: code-davinci-002 + extra_options: + temperature: 0 + max_tokens: 512 + +# Other solvers. They are just base models, but with temperature set to 0. +bluff/gpt-4: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4 + extra_options: + temperature: 0 + +bluff/gpt-3.5-turbo-16k: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo-16k + extra_options: + temperature: 0 + +bluff/gpt-4-base: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-base + extra_options: + temperature: 0 + max_tokens: 512 + +bluff/code-davinci-002: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: code-davinci-002 + extra_options: + temperature: 0 + max_tokens: 512 diff --git a/evals/registry/solvers/defaults.yaml b/evals/registry/solvers/defaults.yaml new file mode 100644 index 0000000000..37aac1f3d4 --- /dev/null +++ b/evals/registry/solvers/defaults.yaml @@ -0,0 +1,603 @@ + +# ------------------ +# Human input +# ------------------ + +human_cli: + class: evals.solvers.human_cli_solver:HumanCliSolver + + +# ------------------ +# gpt-3.5-turbo +# ------------------ + +# generation tasks + +generation/direct/gpt-3.5-turbo: &generation_chatmodel + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo + extra_options: + temperature: 1 + max_tokens: 512 + +generation/cot/gpt-3.5-turbo: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo + extra_options: + temperature: 1 + max_tokens: 512 + +# 
classification tasks + +classification/direct/gpt-3.5-turbo: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo + extra_options: + temperature: 0 + max_tokens: 1 + valid_answers: ["A", "B", "C", "D"] + +classification/cot/gpt-3.5-turbo: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo + extra_options: + temperature: 0 + max_tokens: 1 + valid_answers: ["A", "B", "C", "D"] + +# ------------------ +# gpt-3.5-turbo-16k +# ------------------ + +# generation tasks + +generation/direct/gpt-3.5-turbo-16k: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo-16k + extra_options: + temperature: 1 + max_tokens: 512 + +generation/cot/gpt-3.5-turbo-16k: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo-16k + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo-16k + extra_options: + temperature: 1 + max_tokens: 512 + +# classification tasks + +classification/direct/gpt-3.5-turbo-16k: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo-16k + extra_options: + temperature: 0 + max_tokens: 1 + valid_answers: ["A", "B", "C", "D"] + +classification/cot/gpt-3.5-turbo-16k: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo-16k + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo-16k + extra_options: + temperature: 0 + max_tokens: 1 + valid_answers: ["A", "B", "C", "D"] + +# ------------------ +# code-davinci-002 +# ------------------ + +# generation tasks + +generation/hhh/code-davinci-002: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: code-davinci-002 + extra_options: + temperature: 1 + max_tokens: 512 + +generation/cot_hhh/code-davinci-002: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: code-davinci-002 + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: code-davinci-002 + extra_options: + temperature: 1 + max_tokens: 512 + +# classification tasks + +classification/hhh/code-davinci-002: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: code-davinci-002 + extra_options: + temperature: 0 + max_tokens: 1 + valid_answers: ["A", "B", "C", "D"] + 
+classification/cot_hhh/code-davinci-002: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: code-davinci-002 + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: code-davinci-002 + extra_options: + temperature: 0 + max_tokens: 1 + valid_answers: ["A", "B", "C", "D"] + +# ------------------ +# gpt-4 +# ------------------ + +# TODO: Can we reuse most of these configs and just replace the parts we need? +# Ideally we would just do something like + +# generation/direct/gpt-3.5-turbo: &generation_chatmodel +# (... fill in all params and reuse below) +# +# generation/direct/gpt-4: +# <<: *generation_chatmodel +# args: +# completion_fn_options: +# model: gpt-4 + +# But this doesn't work as we hope; we lose all of the `extra_options` because +# we cannot overwrite a nested property without overwriting the whole tree + + +# generation tasks + +generation/direct/gpt-4: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4 + extra_options: + temperature: 1 + max_tokens: 512 + +generation/cot/gpt-4: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4 + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4 + extra_options: + temperature: 1 + max_tokens: 512 + +# classification tasks + +classification/direct/gpt-4: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4 + extra_options: + temperature: 0 + max_tokens: 1 + valid_answers: ["A", "B", "C", "D"] + +classification/cot/gpt-4: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4 + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4 + extra_options: + temperature: 0 + max_tokens: 1 + valid_answers: ["A", "B", "C", "D"] + + +# ------------------ +# gpt-4-32k +# ------------------ +# Similar to gpt-4 config above until we find a better way +# to parameterise these configs. 
+ + +# generation tasks + +generation/direct/gpt-4-32k: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-32k + extra_options: + temperature: 1 + max_tokens: 512 + +generation/cot/gpt-4-32k: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-32k + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-32k + extra_options: + temperature: 1 + max_tokens: 512 + +# classification tasks + +classification/direct/gpt-4-32k: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-32k + extra_options: + temperature: 0 + max_tokens: 1 + valid_answers: ["A", "B", "C", "D"] + +classification/cot/gpt-4-32k: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-32k + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-32k + extra_options: + temperature: 0 + max_tokens: 1 + valid_answers: ["A", "B", "C", "D"] + +# ------------------ +# gpt-4-1106-preview +# ------------------ +# Similar to gpt-4 config above until we find a better way +# to parameterise these configs. + + +# generation tasks + +generation/direct/gpt-4-1106-preview: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-1106-preview + extra_options: + temperature: 1 + max_tokens: 512 + +generation/cot/gpt-4-1106-preview: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-1106-preview + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-1106-preview + extra_options: + temperature: 1 + max_tokens: 512 + +# classification tasks + +classification/direct/gpt-4-1106-preview: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-1106-preview + extra_options: + temperature: 0 + max_tokens: 1 + valid_answers: ["A", "B", "C", "D"] + +classification/cot/gpt-4-1106-preview: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-1106-preview + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-1106-preview + extra_options: + temperature: 0 + max_tokens: 1 + valid_answers: ["A", "B", "C", "D"] + + +# ------------------ +# gpt-4-base +# ------------------ + +# generation tasks + +generation/hhh/gpt-4-base: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-base + extra_options: + temperature: 1 + max_tokens: 512 + +generation/cot_hhh/gpt-4-base: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + 
completion_fn_options: + model: gpt-4-base + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-base + extra_options: + temperature: 1 + max_tokens: 512 + +# classification tasks + +classification/hhh/gpt-4-base: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-base + extra_options: + temperature: 0 + max_tokens: 1 + valid_answers: ["A", "B", "C", "D"] + +classification/cot_hhh/gpt-4-base: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-base + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-base + extra_options: + temperature: 0 + max_tokens: 1 + valid_answers: ["A", "B", "C", "D"] + + +# ------------------ +# Assistants API +# ------------------ + +generation/assistants/gpt-3.5-turbo-1106: + class: evals.solvers.openai_assistants_solver:OpenAIAssistantsSolver + args: + tools: + - type: code_interpreter + - type: retrieval + model: gpt-3.5-turbo-1106 + +generation/cot_assistant/gpt-3.5-turbo-1106: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.openai_assistants_solver:OpenAIAssistantsSolver + args: + tools: + - type: code_interpreter + - type: retrieval + model: gpt-3.5-turbo-1106 + extract_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo-1106 + extra_options: + temperature: 1 + max_tokens: 512 + +generation/assistants/gpt-4-1106-preview: + class: evals.solvers.openai_assistants_solver:OpenAIAssistantsSolver + args: + tools: + - type: code_interpreter + - type: retrieval + model: gpt-4-1106-preview + +generation/cot_assistant/gpt-4-1106-preview: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.openai_assistants_solver:OpenAIAssistantsSolver + args: + tools: + - type: code_interpreter + - type: retrieval + model: gpt-4-1106-preview + extract_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-1106-preview + extra_options: + temperature: 1 + max_tokens: 512 diff --git a/evals/registry/solvers/make-me-pay.yaml b/evals/registry/solvers/make-me-pay.yaml new file mode 100644 index 0000000000..15bd158019 --- /dev/null +++ b/evals/registry/solvers/make-me-pay.yaml @@ -0,0 +1,107 @@ +make-me-pay/baseline/gpt-3.5-turbo-16k: + class: evals.elsuite.make_me_pay.solvers.lm_con_artist_solver:LMConArtistSolver + args: + base_solver_class: evals.solvers.openai_solver:OpenAISolver + base_solver_args: + completion_fn_options: + model: gpt-3.5-turbo-16k + extra_options: + temperature: 1.0 + max_tokens: 1000 + +make-me-pay/baseline/gpt-4: + class: evals.elsuite.make_me_pay.solvers.lm_con_artist_solver:LMConArtistSolver + args: + base_solver_class: evals.solvers.openai_solver:OpenAISolver + base_solver_args: + completion_fn_options: + model: gpt-4 + extra_options: + temperature: 1.0 + max_tokens: 1000 + 
+make-me-pay/baseline/gpt-4-32k: + class: evals.elsuite.make_me_pay.solvers.lm_con_artist_solver:LMConArtistSolver + args: + base_solver_class: evals.solvers.openai_solver:OpenAISolver + base_solver_args: + completion_fn_options: + model: gpt-4-32k + extra_options: + temperature: 1.0 + max_tokens: 1000 + +make-me-pay/gpt-3.5-turbo-16k: + class: evals.elsuite.make_me_pay.solvers.lm_con_artist_solver:LMConArtistSolver + args: + base_solver_class: evals.solvers.nested.cot_solver:CoTSolver + base_solver_args: + cot_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo-16k + extra_options: + temperature: 1 + max_tokens: 1000 + extract_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo-16k + extra_options: + temperature: 1 + max_tokens: 1000 + lm_system_prompt: True + time_reminder: True + +make-me-pay/gpt-4: + class: evals.elsuite.make_me_pay.solvers.lm_con_artist_solver:LMConArtistSolver + args: + base_solver_class: evals.solvers.nested.cot_solver:CoTSolver + base_solver_args: + cot_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4 + extra_options: + temperature: 1 + max_tokens: 1000 + extract_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4 + extra_options: + temperature: 1 + max_tokens: 1000 + lm_system_prompt: True + time_reminder: True + +make-me-pay/gpt-4-32k: + class: evals.elsuite.make_me_pay.solvers.lm_con_artist_solver:LMConArtistSolver + args: + base_solver_class: evals.solvers.nested.cot_solver:CoTSolver + base_solver_args: + cot_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-32k + extra_options: + temperature: 1 + max_tokens: 1000 + extract_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-32k + extra_options: + temperature: 1 + max_tokens: 1000 + lm_system_prompt: True + time_reminder: True + +make-me-pay/cli: + class: evals.solvers.human_cli_solver:HumanCliSolver diff --git a/evals/registry/completion_fns/sandbagging.yaml b/evals/registry/solvers/sandbagging.yaml similarity index 100% rename from evals/registry/completion_fns/sandbagging.yaml rename to evals/registry/solvers/sandbagging.yaml diff --git a/evals/registry/completion_fns/self_prompting.yaml b/evals/registry/solvers/self_prompting.yaml similarity index 87% rename from evals/registry/completion_fns/self_prompting.yaml rename to evals/registry/solvers/self_prompting.yaml index 539a981ef9..4bf9ef22bf 100644 --- a/evals/registry/completion_fns/self_prompting.yaml +++ b/evals/registry/solvers/self_prompting.yaml @@ -1,13 +1,13 @@ # Chat models -self_prompting/chat_completion/gpt-4-32k: - class: evals.solvers.openai_chat_completion_solver:OpenAIChatCompletionSolver +self_prompting/direct/gpt-4-32k: + class: evals.solvers.openai_solver:OpenAISolver args: completion_fn_options: model: gpt-4-32k -self_prompting/chat_completion/gpt-3.5-turbo-16k: - class: evals.solvers.openai_chat_completion_solver:OpenAIChatCompletionSolver +self_prompting/direct/gpt-3.5-turbo-16k: + class: evals.solvers.openai_solver:OpenAISolver args: completion_fn_options: model: gpt-3.5-turbo-16k @@ -15,16 +15,22 @@ self_prompting/chat_completion/gpt-3.5-turbo-16k: # Completion models self_prompting/completion_hhh/code-davinci-002: - class: evals.solvers.openai_completion_hhh_solver:OpenAICompletionHHHSolver + 
class: evals.solvers.nested.hhh_solver:HHHSolver args: - completion_fn_options: - model: code-davinci-002 + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: code-davinci-002 self_prompting/completion_hhh/gpt-4-base: - class: evals.solvers.openai_completion_hhh_solver:OpenAICompletionHHHSolver + class: evals.solvers.nested.hhh_solver:HHHSolver args: - completion_fn_options: - model: gpt-4-base + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-base # CoT diff --git a/evals/registry/solvers/theory_of_mind.yaml b/evals/registry/solvers/theory_of_mind.yaml new file mode 100644 index 0000000000..b3bdface95 --- /dev/null +++ b/evals/registry/solvers/theory_of_mind.yaml @@ -0,0 +1,290 @@ +### ToMi dataset: open-ended completion, the answer usually a single word. +tomi/simple_solver/gpt-3.5-turbo: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo + extra_options: + temperature: 0 + max_tokens: 10 + +tomi/cot_solver/gpt-3.5-turbo: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo + extra_options: + temperature: 0 + max_tokens: 10 + +tomi/simple_solver/code-davinci-002: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: code-davinci-002 + extra_options: + temperature: 0 + max_tokens: 10 + +tomi/cot_solver/code-davinci-002: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: code-davinci-002 + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: code-davinci-002 + extra_options: + temperature: 0 + max_tokens: 10 + +tomi/simple_solver/gpt-4: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4 + extra_options: + temperature: 0 + max_tokens: 10 + +tomi/cot_solver/gpt-4: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4 + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4 + extra_options: + temperature: 0 + max_tokens: 10 + +tomi/simple_solver/gpt-4-base: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-base + extra_options: + temperature: 0 + max_tokens: 10 + +tomi/cot_solver/gpt-4-base: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-base + extra_options: + temperature: 1 + max_tokens: 512 + 
extract_solver: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-base + extra_options: + temperature: 0 + max_tokens: 10 + +### SocialIQA dataset: multiple-choice completion (A, B, C – only one correct). +socialiqa/simple_solver/gpt-3.5-turbo: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo + extra_options: + temperature: 0 + max_tokens: 1 + valid_answers: ["A", "B", "C"] + +socialiqa/cot_solver/gpt-3.5-turbo: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-3.5-turbo + extra_options: + temperature: 0 + max_tokens: 1 + valid_answers: ["A", "B", "C"] + +socialiqa/simple_solver/code-davinci-002: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: code-davinci-002 + extra_options: + temperature: 0 + max_tokens: 2 + valid_answers: ["A", "B", "C"] + +socialiqa/cot_solver/code-davinci-002: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: code-davinci-002 + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: code-davinci-002 + extra_options: + temperature: 0 + max_tokens: 2 # If this is set to 1 the model doesn't sample anything, perhaps a tokeniser issue. 
+ valid_answers: ["A", "B", "C"] + +socialiqa/simple_solver/gpt-4: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4 + extra_options: + temperature: 0 + max_tokens: 1 + valid_answers: ["A", "B", "C"] + +socialiqa/cot_solver/gpt-4: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4 + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4 + extra_options: + temperature: 0 + max_tokens: 1 + valid_answers: ["A", "B", "C"] + +socialiqa/simple_solver/gpt-4-base: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-base + extra_options: + temperature: 0 + max_tokens: 1 + valid_answers: ["A", "B", "C"] + +socialiqa/cot_solver/gpt-4-base: + class: evals.solvers.nested.cot_solver:CoTSolver + args: + cot_solver: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-base + extra_options: + temperature: 1 + max_tokens: 512 + extract_solver: + class: evals.solvers.nested.hhh_solver:HHHSolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4-base + extra_options: + temperature: 0 + max_tokens: 1 + valid_answers: ["A", "B", "C"] + +socialiqa/selfconsistency/gpt-4: + class: evals.solvers.nested.self_consistency_solver:SelfConsistencySolver + args: + solver: + class: evals.solvers.openai_solver:OpenAISolver + args: + completion_fn_options: + model: gpt-4 + extra_options: + temperature: 1 + max_tokens: 128 + mode: "judge" \ No newline at end of file diff --git a/evals/solvers/README.md b/evals/solvers/README.md new file mode 100644 index 0000000000..864054b186 --- /dev/null +++ b/evals/solvers/README.md @@ -0,0 +1,68 @@ +# Solvers (Beta) + +> *Important: The Solvers framework is still in beta, so new dataset submissions that rely on eval templates should still use the original Eval classes with CompletionFn instead of SolverEval with Solvers.* + +When building evaluations, it is helpful to make a distinction between the Evaluation logic ('what is the task and how is it graded'), and ways that actors attempt to Solve the evaluation ('what is the strategy they take, with what tools, etc'). These often get conflated, with LLM evaluations hardcoding assumptions about how the LLM should attempt to solve the problem. To provide a better separation of concerns, we introduce the new `SolverEval` class to build evaluations and the `Solver` class to solve it. + +Running a Solver against a SolverEval works in exactly the same way as running a CompletionFn against an Eval: +```bash +oaieval +``` + +## What are Solvers? +Solvers are an abstraction layer for the entity that "solves" an eval. Often, we think of this as just the model that generates a text response when given a prompt. However, “How good is GPT-4 on this eval?” is an underspecified question. Interacting with a model requires scaffolding (prompting, tooling, etc.), and scaffolding can drastically change the model’s behavior; so any claims about performance should specify the entire system (model + scaffolding) being evaluated. 
+
+In the context of evals, we call the systems that are used to solve evals "Solvers".
+
+> **Relationship with [Completion Functions](/docs/completion-fns.md):** Completion Functions was our first iteration of this abstraction, reasonably assuming that the "solver" would be a function that takes a prompt and returns a completion. However, we've found that passing a prompt to the CompletionFn encourages eval designers to write prompts that often privilege a particular kind of Solver over others, e.g. if developing with ChatCompletion models, the eval tends to bake in prompts that work best for ChatCompletion models. In moving from Completion Functions to Solvers, we are making a deliberate choice to write Solver-agnostic evals, and delegating any model-specific or strategy-specific code to the Solver.
+
+## Interface between Eval and Solver
+
+Careful design of the interface between the eval and the Solver is central to successful implementation of the Solver abstraction. On each turn, the eval provides a `TaskState` object to the Solver, and the Solver returns a `SolverResult` object to the Eval. The Eval then uses the `SolverResult` to update its internal state, and the process repeats until the Eval is complete.
+
+[image: diagram of the Eval-Solver loop, where the Eval sends a `TaskState` and receives a `SolverResult`]
+
+The `TaskState` should contain all the information that a Solver needs to provide a response to the Eval environment.
+```python
+@dataclass
+class TaskState:
+    task_description: str
+    messages: list[Message] = field(default_factory=list)
+    current_state: Any = None
+```
+- The `task_description` describes the overall task instructions, including the expected response format. In general, this should be fixed across all samples of the eval.
+- The list of `messages` contains the conversation so far. For example, it is often useful to include an input sample as the first message. Any previous interactions should also be included here.
+- Any relevant `current_state` variables that should be passed to the Solver. While the current state of the eval should be apparent from previous messages, it is sometimes useful to include explicit state information here (e.g. the current game score or number of turns remaining) for easy access by the Solver without having to parse the messages.
+
+On the other hand, the `SolverResult` is simply the response from the Solver to the Eval.
+```python
+class SolverResult:
+    def __init__(self, output: str, **metadata):
+        self._output = output
+        self._metadata = metadata
+```
+- The `output` is the response from the Solver to the Eval, which will be parsed by the Eval. We currently assume that this will always be a string.
+- `metadata` is an optional field that may be used to pass additional information from the Solver to the Eval, e.g. for logging purposes.
+
+> If you're familiar with CompletionFns, you can think of `TaskState` as a generalized version of the `prompt` and `SolverResult` as the Solver equivalent of `CompletionResult`.
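+
+For illustration, here is a minimal sketch of a custom Solver. The class name `EchoSolver` and its trivial strategy are hypothetical; the point is only to show the `_solve()` interface that Solvers in this repository implement:
+
+```python
+from evals.solvers.solver import Solver, SolverResult
+from evals.task_state import TaskState
+
+
+class EchoSolver(Solver):
+    """Toy example: replies with the content of the most recent message."""
+
+    def __init__(self, *args, **kwargs):
+        # The framework may pass extra arguments (e.g. a registry);
+        # this toy solver does not need any of them.
+        pass
+
+    def _solve(self, task_state: TaskState, **kwargs) -> SolverResult:
+        if task_state.messages:
+            reply = task_state.messages[-1].content
+        else:
+            reply = task_state.task_description
+        return SolverResult(reply)
+```
+
+A solver like this would then be registered in a YAML file under `evals/registry/solvers/` so that it can be selected by name when running `oaieval`.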
+
+## Which evals can I use with Solvers?
+
+`SolverEval` is our new class for building evals that are compatible with Solvers. It is a subclass of `Eval`, with a few small differences:
+- It expects only a single Solver as input rather than a list of CompletionFns. This clarifies that only one Solver can be evaluated at once; evals may still use additional models, e.g. for model-model interactions, but additional models belong to the environment and should be created by the eval itself rather than passed in as input.
+- Each call to `SolverEval.eval_sample()` is provided a different copy of the Solver. This allows Solvers to be stateful (e.g. have a memory) without interfering with other samples.
+
+We currently have a number of Solver-compatible evals that subclass `SolverEval` in [`evals/elsuite/`](/evals/elsuite/). As of now, old `Eval`-based evals built with the Completion Functions protocol in mind will not work with Solvers. This is because `Solver` and `CompletionFn` have different protocols (i.e. `Solver` takes a `TaskState` and returns a `SolverResult`, while `CompletionFn` takes a `Prompt` and returns a `CompletionResult`).
+
+## Working with Solvers
+
+The Solvers framework is still in beta, and we make this available largely for power users who want to experiment with the Solver abstraction. If you simply wish to contribute new dataset submissions that rely on existing eval templates, you should still use the original Eval classes with CompletionFn instead of SolverEval with Solvers.
+
+If you already know how to write an Eval class (see [Eval docs](/docs/custom-eval.md)), writing a SolverEval is very similar. See the following examples of SolverEval classes:
+- [evals/elsuite/basic/match_with_solvers.py](/evals/elsuite/basic/match_with_solvers.py): A simple eval template for multiple-choice QA tasks.
+- More coming soon!
+
+Likewise, writing Solvers is similar to writing CompletionFns, and follows the same process as documented [here](/docs/completion-fns.md). You can see examples of our currently implemented Solvers in [`evals/solvers/`](/evals/solvers); please see [`evals/registry/solvers/defaults.yaml`](/evals/registry/solvers/defaults.yaml) for Solvers that have been designed to be usable by any SolverEval. For example, to run a Chain-of-Thought solver using gpt-3.5-turbo against an eval, you can run:
+```bash
+oaieval generation/cot/gpt-3.5-turbo {SOLVEREVAL}
+```
diff --git a/evals/solvers/human_cli_solver.py b/evals/solvers/human_cli_solver.py
index fbaedf40d5..527ae5bfa4 100644
--- a/evals/solvers/human_cli_solver.py
+++ b/evals/solvers/human_cli_solver.py
@@ -9,7 +9,12 @@ class HumanCliSolver(Solver):
     so this makes sense only with EVALS_SEQUENTIAL=1.
     """

-    def __call__(
+    def __init__(self, *args, **kwargs):
+        # We don't want any args/kwargs, but the library by default passes
+        # registry to the Solver.
+ pass + + def _solve( self, task_state: TaskState, **kwargs, @@ -18,7 +23,7 @@ def __call__( msgs = [Message("system", task_state.task_description)] msgs += task_state.messages - prompt = "\n".join([f"{msg.role}: {msg.content}" for msg in msgs]) + prompt = "\n".join([f"{msg.role}: {msg.content}" for msg in msgs]) + "\n" answer = input(prompt) return SolverResult(answer) diff --git a/evals/solvers/nested/cot_solver.py b/evals/solvers/nested/cot_solver.py new file mode 100644 index 0000000000..8af41d98db --- /dev/null +++ b/evals/solvers/nested/cot_solver.py @@ -0,0 +1,82 @@ +from typing import Any + +from evals.solvers.prompts.cot import DEFAULT_COT_TEMPLATE, DEFAULT_EXTRACT_ANSWER_TEMPLATE +from evals.solvers.solver import NestedSolver, Solver, SolverResult, SolverSpec +from evals.solvers.utils import PersistentMemoryCache +from evals.task_state import Message, TaskState + + +class CoTSolver(NestedSolver): + def __init__( + self, + cot_solver: SolverSpec, + extract_solver: SolverSpec, + cot_template: str = DEFAULT_COT_TEMPLATE, + extract_template: str = DEFAULT_EXTRACT_ANSWER_TEMPLATE, + persistent_memory: bool = True, + private_interaction_length: int = 3, # TODO: do this better + registry: Any = None, + ): + super().__init__(cot_solver=cot_solver, extract_solver=extract_solver) + + self._cot_template = cot_template + self._extract_template = extract_template + + self.interaction_cache = ( + PersistentMemoryCache(private_interaction_length) if persistent_memory else None + ) + + @property + def cot_solver(self) -> Solver: + return self.get_solver("cot_solver") + + @property + def extract_solver(self) -> Solver: + return self.get_solver("extract_solver") + + def cot_template(self, task_state: TaskState) -> str: + # This function is intended to be overwritten by solvers that extend CoTSolver + # and vary cot_template depending on the task_state + return self._cot_template + + def extract_template(self, task_state: TaskState) -> str: + # This function is intended to be overwritten by solvers that extend CoTSolver + # and vary extract_template depending on the task_state + return self._extract_template + + def _solve( + self, + task_state: TaskState, + **kwargs, + ) -> SolverResult: + if self.interaction_cache: + # Add in previous (private) memories + task_state.messages = self.interaction_cache.load_private_interaction(task_state) + + # Reasoning step + task_state.messages.append(Message(role="system", content=self.cot_template(task_state))) + reasoning_result = self.cot_solver(task_state=task_state, **kwargs) + reasoning_output = reasoning_result.output + + # Extract answer step + task_state.messages.append(Message(role="assistant", content=reasoning_output)) + task_state.messages.append( + Message(role="system", content=self.extract_template(task_state)) + ) + extracted_result = self.extract_solver(task_state=task_state, **kwargs) + extracted_answer = extracted_result.output + + task_state.messages.append(Message(role="assistant", content=extracted_answer)) + + # Save the interaction + if self.interaction_cache: + self.interaction_cache.save_private_interaction(task_state) + + return SolverResult( + output=extracted_answer, + reasoning_output=reasoning_output, + ) + + @property + def name(self) -> str: + return f"CoT_{self.cot_solver.name}_{self.extract_solver.name}" diff --git a/evals/solvers/nested/fewshot_solver.py b/evals/solvers/nested/fewshot_solver.py new file mode 100644 index 0000000000..78e305e1e1 --- /dev/null +++ b/evals/solvers/nested/fewshot_solver.py @@ -0,0 +1,109 @@ 
+import random +from typing import Any + +from evals.data import get_jsonl +from evals.solvers.solver import NestedSolver, Solver, SolverResult, SolverSpec +from evals.task_state import Message, TaskState + + +class FewShotSolver(NestedSolver): + def __init__( + self, + train_jsonl: str, # TODO: move this to be handled eval-side + n_shots: int, + base_solver: SolverSpec, + repeat_task_description: bool = False, + registry: Any = None, + seed: int = 121123, + ): + super().__init__(registry=registry, base_solver=base_solver) + self.n_shots = n_shots + self.repeat_task_description = repeat_task_description + self.rng = random.Random(seed) + + train_data = get_jsonl(train_jsonl) + + assert ( + len(train_data) >= n_shots + ), f"Insufficient training data provided for few-shot solver, provide at least {n_shots} samples. Size of training data: {len(train_data)}" + assert ( + isinstance(train_data[0], list) + or isinstance(train_data[0], dict) + and "input" in train_data[0] + and "ideal" in train_data[0] + ), "Incorrect format of training data provided for few-shot solver, each data point should be a list of messages or a dictionary with 'input' and 'ideal' keys." + + formatted_train_data = [] + + if isinstance(train_data[0], dict): + if "content" in train_data[0]["input"][0]: + for datapoint in train_data: + formatted_train_data += [ + ( + Message(role="user", content=datapoint["input"][0]["content"]), + Message(role="assistant", content=datapoint["ideal"]), + ) + ] + else: + for datapoint in train_data: + formatted_train_data += [ + ( + Message(role="user", content=datapoint["input"]), + Message(role="assistant", content=datapoint["ideal"]), + ) + ] + elif isinstance(train_data[0], list): + formatted_train_data = [ + ( + Message(role=msg_list[0]["role"], content=msg_list[0]["content"]), + Message(role=msg_list[1]["role"], content=msg_list[1]["content"]), + ) + for msg_list in train_data + ] + else: + raise ValueError( + f"Unknown format of training data provided for few-shot solver, each data point should be a list of messages or a dictionary with 'input' and 'ideal' keys. Example data point: {train_data[0]}" + ) + + self.train_data = formatted_train_data + + @property + def base_solver(self) -> Solver: + return self.get_solver("base_solver") + + def _solve( + self, + task_state: TaskState, + **kwargs, + ) -> SolverResult: + new_task_state = self._modify_task_state(task_state) + return self.base_solver(new_task_state) + + def _modify_task_state(self, task_state: TaskState) -> TaskState: + assert all( + user_message not in task_state.messages + for (user_message, assistant_message) in self.train_data + ), f"The few-shot training data provided contains the current test set point: {task_state.messages}. Check datasets for contamination." 
+ + # Sample n_shots from train samples + samples = self.rng.sample(self.train_data, self.n_shots) + + msgs = [] + for idx, (user_message, assistant_message) in enumerate(samples): + if idx != 0 and self.repeat_task_description: + msgs.append(Message(role="system", content=task_state.task_description)) + + msgs += [user_message, assistant_message] + + # Context for current sample + msgs += task_state.messages + + return TaskState( + task_description=task_state.task_description, + messages=msgs, + current_state=task_state.current_state, + ) + + @property + def name(self) -> str: + return self.base_solver.name diff --git a/evals/solvers/nested/hhh_solver.py b/evals/solvers/nested/hhh_solver.py new file mode 100644 index 0000000000..eb9a1b7e32 --- /dev/null +++ b/evals/solvers/nested/hhh_solver.py @@ -0,0 +1,46 @@ +from typing import Any + +from evals.solvers.prompts.hhh import HHH_MSGS +from evals.solvers.solver import NestedSolver, Solver, SolverResult, SolverSpec +from evals.task_state import Message, TaskState + + +class HHHSolver(NestedSolver): + """ + Adds Helpful, Harmless and Honest (HHH) messages (Bai et al., 2022) to the + prompt history. This is especially useful for completion models that are + not instruction- or chat-tuned, as the context encourages the model to + generate a response that is consistent with a HHH chatbot assistant. + """ + + def __init__( + self, + solver: SolverSpec, + registry: Any = None, + ): + super().__init__(solver=solver) + + @property + def solver(self) -> Solver: + return self.get_solver("solver") + + def _solve( + self, + task_state: TaskState, + **kwargs, + ) -> SolverResult: + task_state.messages = ( + HHH_MSGS[1:] # The first HHH message will go into the task_description + + [ + Message(role="system", content=task_state.task_description), + ] + + task_state.messages + ) + task_state.task_description = HHH_MSGS[0].content # Below are a series of dialogues... + + solver_result = self.solver(task_state=task_state, **kwargs) + return solver_result + + @property + def name(self) -> str: + return f"{self.solver.name}_hhh" diff --git a/evals/solvers/nested/self_consistency_solver.py b/evals/solvers/nested/self_consistency_solver.py new file mode 100644 index 0000000000..4996488f6f --- /dev/null +++ b/evals/solvers/nested/self_consistency_solver.py @@ -0,0 +1,147 @@ +import logging +from collections import Counter +from typing import Any, Optional + +from evals.solvers.solver import NestedSolver, Solver, SolverResult, SolverSpec +from evals.solvers.utils import PersistentMemoryCache +from evals.task_state import Message, TaskState + +DEFAULT_COT_TEMPLATE = """Before answering, reason in a step-by-step manner as to get the right answer, then conclude with the answer. Format your output as {prefix} """ +DEFAULT_ANSWER_PREFIX = "The answer is" +DEFAULT_JUDGE_PROMPT = "You are about to see some answers for the follwing question: '{question}'. If there is agreement between the answers, output the consensus answer. Otherwise, output [NO CONSENSUS]. You should format your final answer as {prefix} " + +logger = logging.getLogger(__name__) + + +class SelfConsistencySolver(NestedSolver): + """This is a solver that implements self-consistency prompting. + It works by generating multiple chain-of-thought completions, and + selecting the answer that occurs most frequently in the completions. + The answer in each completion is extracted by looking for a prefix, + either the default above or one provided through the YAML config. 
+ """ + + def __init__( + self, + solver: SolverSpec, + num_generations: int = 5, + cot_template: str = DEFAULT_COT_TEMPLATE, + answer_prefix: str = DEFAULT_ANSWER_PREFIX, + judge_prompt: Optional[str] = None, + mode: str = "count", + persistent_memory: bool = True, + private_interaction_length: int = 1, + registry: Any = None, + ): + super().__init__(registry=registry, solver=solver, judge_solver=solver) + self.num_generations = num_generations + self.answer_prefix = answer_prefix + self.cot_template = cot_template.format(prefix=self.answer_prefix) + self.mode = mode + self.judge_prompt = judge_prompt if judge_prompt else DEFAULT_JUDGE_PROMPT + + # Every time a private interaction happens, we cache one + # additional prompt and num_generations reasoning completions. + self.interaction_cache = ( + PersistentMemoryCache(private_interaction_length + num_generations) + if persistent_memory + else None + ) + + @property + def solver(self) -> Solver: + return self.get_solver("solver") + + @property + def judge_solver(self) -> Solver: + return self.get_solver("judge_solver") + + def _solve( + self, + task_state: TaskState, + **kwargs, + ) -> SolverResult: + unique_answers = Counter() + + if self.interaction_cache: + # Add in previous (private) memories + task_state.messages = self.interaction_cache.load_private_interaction(task_state) + + # Execute reasoning step by generating multiple reasoning paths. + task_state.messages.append(Message(role="system", content=self.cot_template)) + reasoning_completions = [] + + for i in range(self.num_generations): + raw_result = self.solver(task_state) + + # We don't immediately append this to the task state messages because + # doing so would influence subsequent reasonings, which we do not want. + reasoning_completions.append(raw_result.output) + try: + answer = self._extract_answer(raw_result) + unique_answers[answer] += 1 + except ValueError as ve: + logger.info(f"ValueError while extracting answer: {ve}") + continue + + # Extract the consensus answer from all the reasonings, if possible to do so. 
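        # --------------------------------------------------------------------
        # Illustrative sketch: in "count" mode the consensus step below boils
        # down to prefix-based extraction plus a majority vote over the
        # completions. Roughly, with made-up completion strings:
        #
        #   from collections import Counter
        #   completions = [
        #       "Step 1 ... The answer is 42",
        #       "Reasoning ... The answer is 42",
        #       "Hmm ... The answer is 7",
        #   ]
        #   votes = Counter(
        #       c.split("The answer is")[1].strip()
        #       for c in completions
        #       if "The answer is" in c
        #   )
        #   consensus, count = votes.most_common(1)[0]   # -> ("42", 2)
        # --------------------------------------------------------------------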
+ if self.mode == "count": + if len(unique_answers) > 0: + consensus_answer, num_occurrences = unique_answers.most_common(1)[0] + else: + logger.error( + f"Could not detect any answers for mode 'count' among the completions: {reasoning_completions}" + ) + else: + if len(task_state.messages) > 0: + prompt = task_state.messages[-2].content + else: + prompt = task_state.task_description + previous_answer_task_state = TaskState( + task_description=task_state.task_description, + messages=[ + *task_state.messages, # Previous message history + Message( + role="system", + content=self.judge_prompt.format( + question=prompt, prefix=self.answer_prefix + ), + ), + *[Message(role="user", content=answer) for answer in reasoning_completions], + Message( + role="system", + content="What is the final answer in the format requested by the question?", + ), + ], + ) + raw_result = self.judge_solver(previous_answer_task_state) + try: + consensus_answer = self._extract_answer(raw_result) + except ValueError as ve: + logger.info(f"ValueError while extracting answer: {ve}") + consensus_answer = "[NO CONSENSUS]" + + task_state.messages += reasoning_completions + if self.interaction_cache: + self.interaction_cache.save_private_interaction(task_state) + + return SolverResult( + output=consensus_answer, + reasoning_completions=reasoning_completions, + ) + + def _extract_answer(self, raw_result: SolverResult) -> str: + out_text = raw_result.output + if self.answer_prefix.lower() in out_text: + answer = out_text.split(self.answer_prefix.lower())[1].strip() + elif self.answer_prefix in out_text: + answer = out_text.split(self.answer_prefix)[1].strip() + else: + raise ValueError( + f"Failed to extract a valid answer: answer prefix '{self.answer_prefix}' not found in model response: {out_text}." + ) + return answer + + @property + def name(self) -> str: + return f"SelfConsistencySolver wrapping {self.solver.name}" diff --git a/evals/solvers/openai_assistants_solver.py b/evals/solvers/openai_assistants_solver.py new file mode 100644 index 0000000000..7bc02da65c --- /dev/null +++ b/evals/solvers/openai_assistants_solver.py @@ -0,0 +1,235 @@ +import logging +import time +from threading import Lock +from typing import Any, Dict, Optional + +import openai +from openai.types.beta import Assistant +from openai.types.beta.thread import Thread +from openai.types.beta.threads.run import Run + +from evals.record import record_sampling +from evals.registry import client +from evals.solvers.solver import Solver, SolverResult +from evals.task_state import Message, TaskState + +FILE_CACHE_LOCK = Lock() +FILE_CACHE = {} # File cache can be reused across solvers + + +class OpenAIAssistantsSolver(Solver): + """ + A solver that uses the OpenAI Assistants API to solve tasks. If you are new to Assistants, please + start by reading the overview to understand how Assistants work: + https://platform.openai.com/docs/assistants/overview + + Features: + - Works with any tools (e.g. `code-interpreter`, `retrieval`) that are supported by Assistants. + To use a tool, add it to the `tools` argument when instantiating the solver. + - Supports file reading via the `code-interpreter` and `retrieval` tools. To use a file, add it + to the `file_paths` argument when instantiating the solver (the file will be available to all + threads). To use a file in a specific thread, add it to the `files` argument in the + `TaskState.current_state` object. + + Special notes: + - IMPORTANT: The Assistants API is priced differently than the Chat and Completion APIs. 
Please + familiarize yourself with https://openai.com/pricing to avoid unexpected charges. + - Each instantiation of the OpenAIAssistantsSolver class creates a new Assistant and Thread. + - `solver.copy()` will create a new Thread but reuse the same Assistant. + - The Assistant is stateful, so it is not possible to modify the history of messages, and + the Solver assumes that new messages are sent after the last Assistant message. + - The Assistants API is still in beta, so some features are not yet stable (e.g. assistants + using the retrieval tool need to be reminded in-chat to read the file). + - This solver does not provide support for none-text content in messages yet (e.g. images). + """ + + def __init__( + self, + model: str, + name: Optional[str] = None, + description: Optional[str] = None, + tools: list[Dict[str, Any]] = [], + file_paths: list[str] = [], + assistant: Optional[Assistant] = None, + thread: Optional[Thread] = client.beta.threads.create(), + registry: Any = None, + ): + self.model = model + self.thread = thread + self.tools = tools + self.all_uploaded_files = [] + if not assistant: + file_ids = self._create_files(file_paths) + self.assistant = client.beta.assistants.create( + model=model, + name=name, + description=description, + tools=tools, + file_ids=file_ids, # Files attached here are available to all threads. + ) + else: + # This is a special init case for copying the solver - see `OpenAIAssistantsSolver.copy()` + assert ( + not name and not description and not tools and not file_paths + ), "Cannot specify `name`, `description`, `tools`, or `file_paths` when copying a solver." + self.assistant = assistant + + def _solve( + self, + task_state: TaskState, + **kwargs, + ) -> SolverResult: + """ + ARGS + ==== + `task_state`: A `TaskState` object that contains the task description and the input. + `kwargs`: Other arguments passed to the solver. + + RETURNS + ======= + The result of the solver. + """ + + # Upload thread-specific files + thread_file_ids = [] + if task_state.current_state is not None and "files" in task_state.current_state: + thread_file_ids = self._create_files(task_state.current_state["files"]) + + # We only send new messages to the Assistant since the Assistant is stateful. + # This assumes that any new messages happen after the last Assistant message. + last_assistant_msg_idx = self._get_last_assistant_message_idx(task_state.messages) + new_msgs_start_idx = last_assistant_msg_idx + 1 if last_assistant_msg_idx is not None else 0 + + # Add new messages to Thread + last_msg_sent = None + for idx, message in enumerate(task_state.messages[new_msgs_start_idx:]): + user_message = self._convert_to_user_message(message) # API only allows "user" messages + last_msg_sent = client.beta.threads.messages.create( + thread_id=self.thread.id, + role=user_message.role, + content=user_message.content, + file_ids=thread_file_ids + if idx == 0 + else [], # Attach files to first new message only + ) + + # Run Assistant on the Thread + run = client.beta.threads.runs.create( + assistant_id=self.assistant.id, + thread_id=self.thread.id, + instructions=task_state.task_description, # Apply task description as `instructions` + ) + run = self._wait_on_run(run, self.thread) + if run.status != "completed": + error_msg = f"Assistants API Run failed with status {run.status}. 
More details: {run}" + raise RuntimeError(error_msg) + + # Get Assistant response(s) + messages = client.beta.threads.messages.list( + thread_id=self.thread.id, + order="asc", + after=last_msg_sent.id if last_msg_sent else None, + ) + + contents = [] + for message in messages: + for content in message.content: + if content.type == "text": + contents.append(content.text.value) + # TODO: Handle content.text.annotations ? + elif content.type == "image_file": + contents.append("{Assistant sent an image}") + logging.warning("Assistant sent an image, but this is not yet supported.") + else: + raise NotImplementedError(f"Content type {content.type} not supported.") + output_text = "\n".join(contents) + + # TODO: The Assistant also reports Run Steps which detail logs for tool use + # https://platform.openai.com/docs/api-reference/runs/listRunSteps + + record_sampling( + prompt=task_state.messages, + sampled=[output_text], + model=self.model, + tools=self.tools, + assistant=self.assistant.id, + thread=self.thread.id, + uploaded_files=self.all_uploaded_files, + ) + return SolverResult( + output=output_text, + ) + + def copy(self): + # Assistants don't support copying; each sample uses the same Assistant but interacts with + # a new Thread. + + # Return the a solver that uses the same Assistant, but give it a new Thread + solver_copy = OpenAIAssistantsSolver( + model=self.model, + assistant=self.assistant, + thread=client.beta.threads.create(), + ) + return solver_copy + + def _create_file(self, file_path: str) -> str: + with FILE_CACHE_LOCK: + # If file is already uploaded, just reuse the same file + if file_path in FILE_CACHE: + return FILE_CACHE[file_path] + try: + file = client.files.create( + file=open(file_path, "rb"), + purpose="assistants", + ) + FILE_CACHE[file_path] = file.id + self.all_uploaded_files.append((file_path, file.id)) + except openai.BadRequestError as e: + if "Invalid file format." in e.message: + logging.warning(f"{file_path} rejected due to invalid file format, skipping.") + return None + else: + raise e + return file.id + + def _create_files(self, file_paths: list[str]) -> list[str]: + file_ids = [] + for file_path in file_paths: + file_id = self._create_file(file_path) + if file_id is not None: + file_ids.append(file_id) + return file_ids + + def _get_last_assistant_message_idx(self, messages: list[Message]) -> Optional[int]: + last_idx = None + for i, message in enumerate(messages): + if message.role == "assistant": + last_idx = i + return last_idx + + def _convert_to_user_message(self, message: Message) -> Message: + """ + Assistants API only allows "user" messages, so all other role (e.g. "system") must be rendered + into "user" messages. + """ + if message.role != "user": + message.content = f"[{message.role}] {message.content}" + message.role = "user" + return message + + def _wait_on_run(self, run: Run, thread: Thread) -> Run: + """ + Wait for run to finish. (End state may be "completed", "expired", "failed" or "cancelled".) 
+ Function borrowed from: https://cookbook.openai.com/examples/assistants_api_overview_python + """ + while run.status == "queued" or run.status == "in_progress": + run = client.beta.threads.runs.retrieve( + thread_id=thread.id, + run_id=run.id, + ) + time.sleep(0.5) + return run + + @property + def name(self) -> str: + return f"OpenaiAssistantsSolver_{self.name}_{self.model}" diff --git a/evals/solvers/openai_assistants_solver_test.py b/evals/solvers/openai_assistants_solver_test.py new file mode 100644 index 0000000000..dd55b92f0d --- /dev/null +++ b/evals/solvers/openai_assistants_solver_test.py @@ -0,0 +1,217 @@ +import json +import math +import os +import tempfile +from pathlib import Path + +import pytest + +from evals.record import DummyRecorder +from evals.solvers.openai_assistants_solver import FILE_CACHE, OpenAIAssistantsSolver +from evals.task_state import Message, TaskState + +IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" +MODEL = "gpt-4-1106-preview" + + +@pytest.fixture +def dummy_data_file(scope="session"): + with tempfile.TemporaryDirectory() as tmp_dir: + # Create a data file + dummy_data = { + "passport": "12345678", + "passepartout": "80", + "password": "0netw0three", + } + tmpfile_path = str(Path(tmp_dir) / "password.json") + json.dump(dummy_data, open(tmpfile_path, "w")) + yield dummy_data, tmpfile_path + + +@pytest.fixture +def dummy_recorder(): + recorder = DummyRecorder(None) # type: ignore + with recorder.as_default_recorder("x"): + yield recorder + + +@pytest.fixture +def vanilla_solver(): + solver = OpenAIAssistantsSolver( + model=MODEL, + ) + return solver + + +@pytest.fixture +def code_interpreter_solver(): + solver = OpenAIAssistantsSolver( + model=MODEL, + tools=[{"type": "code_interpreter"}], + ) + return solver + + +@pytest.fixture +def retrieval_solver(): + solver = OpenAIAssistantsSolver( + model=MODEL, + tools=[{"type": "retrieval"}], + ) + return solver + + +@pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="API tests are wasteful to run on every commit.") +def test_solver_copying(dummy_recorder, vanilla_solver): + """ + When OpenAIAssistantsSolver is copied, the Assistant should be the same + but the Thread should be different. + """ + solver = vanilla_solver + + n_copies = 3 + for _ in range(n_copies): + solver_copy = solver.copy() + assert solver_copy.assistant.id == solver.assistant.id + assert solver_copy.thread.id != solver.thread.id + test_multiturn_conversation(dummy_recorder, solver_copy) + + +@pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="API tests are wasteful to run on every commit.") +def test_multiturn_conversation(dummy_recorder, vanilla_solver): + """ + Test that message history of the conversation is preserved across multiple turns. + """ + solver = vanilla_solver + + numbers = [10, 13, 3, 6] + input_messages = [Message(role="user", content=str(num)) for num in numbers] + all_msgs = [] + for idx, msg in enumerate(input_messages): + all_msgs.append(msg) + solver_result = solver( + TaskState( + task_description="You will receive a sequence of numbers, please respond each time with the cumulative sum of all numbers sent so far. 
Answer with only a number.", + messages=all_msgs, + ), + ) + print(solver_result.output) + all_msgs.append(Message(role="assistant", content=solver_result.output)) + assert int(solver_result.output.strip()) == sum(numbers[: idx + 1]) + + +@pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="API tests are wasteful to run on every commit.") +def test_code_interpreter(dummy_recorder, code_interpreter_solver): + solver = code_interpreter_solver + + solver_result = solver( + TaskState( + task_description="", + messages=[ + Message( + role="user", content="Please calculate the sqrt of 145145 to 3 decimal places." + ), + ], + ), + ) + print(solver_result.output) + + assert str(round(math.sqrt(145145), 3)) in solver_result.output + + +@pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="API tests are wasteful to run on every commit.") +def test_task_description(dummy_recorder, vanilla_solver): + solver = vanilla_solver + + target_string = "Por favor, no hablo inglés." + solver_result = solver( + TaskState( + task_description=f"Respond to all messages with '{target_string}'", # Should overwrite the initial `instructions`` + messages=[ + Message( + role="user", content="Please calculate the sqrt of 145145 to 3 decimal places." + ), + ], + ), + ) + print(solver_result.output) + assert solver_result.output == target_string + + +@pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="API tests are wasteful to run on every commit.") +def test_code_interpreter_file(dummy_recorder, dummy_data_file, code_interpreter_solver): + dummy_data, tmpfile_path = dummy_data_file + solver = code_interpreter_solver + + solver_result = solver( + TaskState( + task_description="", + messages=[ + Message( + role="user", + content="Please return the value of the password in the attached file.", + ), + ], + current_state={ + "files": [ + tmpfile_path, + ], + }, + ), + ) + print(solver_result.output) + assert ( + dummy_data["password"] in solver_result.output + ), f"Expected password '{dummy_data['password']}' to be in output, but got: {solver_result.output}" + + +@pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="API tests are wasteful to run on every commit.") +def test_retrieval_file(dummy_recorder, dummy_data_file, retrieval_solver): + dummy_data, tmpfile_path = dummy_data_file + solver = retrieval_solver + + solver_result = solver( + TaskState( + task_description="", + messages=[ + Message( + role="user", + content="Please return the value of the password in the attached file.", + ), + # This prompt-hack is necessary for the model to actually use the file :( + # We should be able to remove this in the future if the model improves. 
+ # https://community.openai.com/t/myfiles-browser-tool-is-not-operational-for-these-files/481922/18 + Message( + role="user", + content="Note that I have attached the file and it is accessible to you via the `myfiles_browser` tool.", + ), + ], + current_state={ + "files": [ + tmpfile_path, + ], + }, + ), + ) + print(solver_result.output) + assert ( + dummy_data["password"] in solver_result.output + ), f"Expected password '{dummy_data['password']}' to be in output, but got: {solver_result.output}" + + +@pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="API tests are wasteful to run on every commit.") +def test_file_cache(dummy_recorder, dummy_data_file, retrieval_solver): + dummy_data, tmpfile_path = dummy_data_file + solver = retrieval_solver + + n_threads = 3 + solver_copies = [solver.copy() for _ in range(n_threads)] + for solver_copy in solver_copies: + test_retrieval_file(dummy_recorder, dummy_data_file, solver_copy) + print() + + assert tmpfile_path in FILE_CACHE, f"File should be cached. Cached files: {FILE_CACHE}" + cached_ids = [FILE_CACHE[tmpfile_path] for _ in solver_copies] + assert all( + [cached_id == FILE_CACHE[tmpfile_path] for cached_id in cached_ids] + ), f"Cached file ID should be the same across threads, but got: {cached_ids}" diff --git a/evals/solvers/openai_chat_completion_cot_solver.py b/evals/solvers/openai_chat_completion_cot_solver.py deleted file mode 100644 index 111e191d60..0000000000 --- a/evals/solvers/openai_chat_completion_cot_solver.py +++ /dev/null @@ -1,129 +0,0 @@ -from dataclasses import dataclass -from typing import Any, Dict, List, Union - -from evals.completion_fns.openai import OpenAIChatCompletionFn -from evals.solvers.prompts.cot import DEFAULT_COT_TEMPLATE, DEFAULT_EXTRACT_ANSWER_TEMPLATE -from evals.solvers.solver import OpenAISolver, SolverResult -from evals.task_state import Message, TaskState - - -@dataclass -class Interaction: - # All messages we've seen (except for the task_description) - messages: List[Message] - - # IDs of the CoT private internal messages - private_messages_ids: List[int] - - -class OpenAIChatCompletionCoTSolver(OpenAISolver): - def __init__( - self, - cot_options: Dict[str, Any] = {}, - cot_template: str = DEFAULT_COT_TEMPLATE, - extract_options: Dict[str, Any] = {}, - extract_template: str = DEFAULT_EXTRACT_ANSWER_TEMPLATE, - valid_answers: Union[list[str], None] = None, - persistent_memory: bool = True, - private_interaction_length: int = 3, - **kwargs, - ): - super().__init__( - completion_fn_options=extract_options, - valid_answers=valid_answers, - ) - - self.cot_completion_fn = OpenAIChatCompletionFn( - **cot_options, - ) - self.cot_template = cot_template - - self.extract_completion_fn = OpenAIChatCompletionFn(**self.completion_fn_options) - self.extract_template = extract_template - - self.persistent_memory = persistent_memory - self.last_interaction = None - self.private_interaction_length = private_interaction_length - - def __call__( - self, - task_state: TaskState, - **kwargs, - ) -> SolverResult: - - past_messages = ( - task_state.messages - if not self.persistent_memory - else self._persistent_memory_past_messages(task_state) - ) - - # Reasoning step - msgs = ( - [ - {"role": "system", "content": task_state.task_description}, - ] - + [msg.to_dict() for msg in past_messages] - + [ - {"role": "system", "content": self.cot_template}, - ] - ) - reasoning_output = self.cot_completion_fn(prompt=msgs, **kwargs).get_completions()[0] - - # Extract answer step - msgs = msgs + [ - {"role": "assistant", "content": 
reasoning_output}, - {"role": "assistant", "content": self.extract_template}, - ] - extracted_answer = self.extract_completion_fn(prompt=msgs, **kwargs).get_completions()[0] - - # Save the interaction - interaction_messages = [Message(**msg) for msg in msgs[1:]] + [ - Message("assistant", extracted_answer) - ] - num_interaction_messages = len(interaction_messages) - private_messages_ids = ( - [] if self.last_interaction is None else self.last_interaction.private_messages_ids - ) - private_messages_ids += list( - range( - num_interaction_messages - self.private_interaction_length - 1, - num_interaction_messages - 1, - ) - ) - self.last_interaction = Interaction(interaction_messages, private_messages_ids) - - return SolverResult( - output=extracted_answer, - reasoning_output=reasoning_output, - ) - - @property - def name(self) -> str: - return f"CoT_{self.cot_completion_fn.model}_{self.extract_completion_fn.model}" - - def _persistent_memory_past_messages(self, task_state: TaskState) -> List[Message]: - if self.last_interaction is None: - return task_state.messages - - # Check if task_state matches our last interaction - interaction = self.last_interaction - task_state_message_ix = 0 - for our_message_ix in range(0, len(interaction.messages)): - if our_message_ix in interaction.private_messages_ids: - continue - else: - if ( - task_state.messages[task_state_message_ix] - != interaction.messages[our_message_ix] - ): - raise ValueError( - ( - f"task_state message {task_state_message_ix} different than the corresponding message " - "in the interaction history. " - "Such scenario is not supported by a CoT solver with peristent_memory = True" - ) - ) - task_state_message_ix += 1 - - # Everything's fine! - return interaction.messages + task_state.messages[task_state_message_ix:] diff --git a/evals/solvers/openai_chat_completion_solver.py b/evals/solvers/openai_chat_completion_solver.py deleted file mode 100644 index 2d9b4f1918..0000000000 --- a/evals/solvers/openai_chat_completion_solver.py +++ /dev/null @@ -1,39 +0,0 @@ -from typing import Any, Dict, Union - -from evals.completion_fns.openai import OpenAIChatCompletionFn -from evals.solvers.solver import OpenAISolver, SolverResult -from evals.task_state import TaskState - - -class OpenAIChatCompletionSolver(OpenAISolver): - def __init__( - self, - completion_fn_options: Dict[str, Any] = {}, - valid_answers: Union[list[str], None] = None, - registry: Any = None, - ): - super().__init__( - completion_fn_options=completion_fn_options, - valid_answers=valid_answers, - ) - - self.completion_fn = OpenAIChatCompletionFn( - **self.completion_fn_options, - ) - - def __call__( - self, - task_state: TaskState, - **kwargs, - ) -> SolverResult: - - msgs = [ - {"role": "system", "content": task_state.task_description}, - ] + [msg.to_dict() for msg in task_state.messages] - - completion_result = self.completion_fn(prompt=msgs, **kwargs) - return SolverResult(completion_result.get_completions()[0]) - - @property - def name(self) -> str: - return self.completion_fn.model diff --git a/evals/solvers/openai_completion_hhh_cot_solver.py b/evals/solvers/openai_completion_hhh_cot_solver.py deleted file mode 100644 index 7c9897e2f2..0000000000 --- a/evals/solvers/openai_completion_hhh_cot_solver.py +++ /dev/null @@ -1,85 +0,0 @@ -from typing import Any, Dict, Union - -from evals.completion_fns.openai import OpenAICompletionFn -from evals.solvers.prompts.cot import DEFAULT_COT_TEMPLATE, DEFAULT_EXTRACT_ANSWER_TEMPLATE -from evals.solvers.prompts.hhh import HHH_PROMPT, 
render_messages -from evals.solvers.solver import OpenAISolver, SolverResult -from evals.task_state import TaskState - - -class OpenAICompletionHHHCoTSolver(OpenAISolver): - def __init__( - self, - cot_options: Dict[str, Any] = {}, - cot_template: str = DEFAULT_COT_TEMPLATE, - extract_options: Dict[str, Any] = {}, - extract_template: str = DEFAULT_EXTRACT_ANSWER_TEMPLATE, - fixed_start: str = "", - valid_answers: Union[list[str], None] = None, - **kwargs, - ): - super().__init__( - completion_fn_options=extract_options, - valid_answers=valid_answers, - ) - - self.cot_completion_fn = OpenAICompletionFn(**cot_options) - self.cot_template = cot_template - - self.extract_completion_fn = OpenAICompletionFn(**self.completion_fn_options) - self.extract_template = extract_template - - self.fixed_start = fixed_start - - def __call__( - self, - task_state: TaskState, - **kwargs, - ) -> SolverResult: - - # Reasoning step - prompt = ( - HHH_PROMPT - + f"""System: {task_state.task_description} - -{render_messages(task_state.messages)}System: {self.cot_template} - -Assistant: """ - ) - reasoning_output_raw = self.cot_completion_fn(prompt=prompt, **kwargs).get_completions()[0] - # Parse response up to the next "Human:", "System:", or "-----" - reasoning_output = ( - reasoning_output_raw.split("Human:")[0].split("System:")[0].split("-----")[0].strip() - ) - - # Extract answer step - prompt = ( - HHH_PROMPT - + f"""System: {task_state.task_description} - -{render_messages(task_state.messages)}System: {self.cot_template} - -Assistant: {reasoning_output} - -System: {self.extract_template} - -Assistant: {self.fixed_start}""" - ) - answer_output_raw = self.extract_completion_fn.__call__( - prompt=prompt, **kwargs - ).get_completions()[0] - # Parse response up to the next "Human:", "System:", or "-----" - answer_output = ( - answer_output_raw.split("Human:")[0].split("System:")[0].split("-----")[0].strip() - ) - answer_output = self.fixed_start + answer_output - - return SolverResult( - output=answer_output, - reasoning_output=reasoning_output, - answer_output_raw=answer_output_raw, - ) - - @property - def name(self) -> str: - return f"CoT_{self.cot_completion_fn.model}_{self.extract_completion_fn.model}" diff --git a/evals/solvers/openai_completion_hhh_solver.py b/evals/solvers/openai_completion_hhh_solver.py deleted file mode 100644 index 6023227566..0000000000 --- a/evals/solvers/openai_completion_hhh_solver.py +++ /dev/null @@ -1,50 +0,0 @@ -from typing import Any, Dict, Union - -from evals.completion_fns.openai import OpenAICompletionFn -from evals.solvers.prompts.hhh import HHH_PROMPT, render_messages -from evals.solvers.solver import OpenAISolver, SolverResult -from evals.task_state import TaskState - - -class OpenAICompletionHHHSolver(OpenAISolver): - def __init__( - self, - completion_fn_options: Dict[str, Any] = {}, - valid_answers: Union[list[str], None] = None, - fixed_start: str = "", - **kwargs, - ): - super().__init__( - completion_fn_options=completion_fn_options, - valid_answers=valid_answers, - ) - - self.completion_fn = OpenAICompletionFn( - **self.completion_fn_options, - ) - - self.fixed_start = fixed_start - - def __call__( - self, - task_state: TaskState, - **kwargs, - ) -> SolverResult: - - prompt = f"{HHH_PROMPT}System: {task_state.task_description}\n\n{render_messages(task_state.messages)}Assistant: " - prompt = prompt + self.fixed_start - - completion_result = self.completion_fn(prompt=prompt, **kwargs) - raw_response = completion_result.get_completions()[0] - - # Parse response up to 
the next "Human:", "System:", or "-----" - parsed_response = ( - raw_response.split("Human:")[0].split("System:")[0].split("-----")[0].strip() - ) - parsed_response = self.fixed_start + parsed_response - - return SolverResult(parsed_response) - - @property - def name(self) -> str: - return self.completion_fn.model diff --git a/evals/solvers/openai_completion_solver.py b/evals/solvers/openai_completion_solver.py deleted file mode 100644 index 1cc604d134..0000000000 --- a/evals/solvers/openai_completion_solver.py +++ /dev/null @@ -1,39 +0,0 @@ -from typing import Any, Dict, Union - -from evals.completion_fns.openai import OpenAICompletionFn -from evals.solvers.solver import OpenAISolver, SolverResult -from evals.task_state import TaskState - - -class OpenAICompletionSolver(OpenAISolver): - def __init__( - self, - completion_fn_options: Dict[str, Any] = {}, - valid_answers: Union[list[str], None] = None, - **kwargs, - ): - super().__init__( - completion_fn_options=completion_fn_options, - valid_answers=valid_answers, - ) - - self.completion_fn = OpenAICompletionFn( - **self.completion_fn_options, - ) - - def __call__( - self, - task_state: TaskState, - **kwargs, - ) -> SolverResult: - - msgs = [ - {"role": "system", "content": task_state.task_description}, - ] + [msg.to_dict() for msg in task_state.messages] - - completion_result = self.completion_fn(prompt=msgs, **kwargs) - return SolverResult(completion_result.get_completions()[0]) - - @property - def name(self) -> str: - return self.completion_fn.model diff --git a/evals/solvers/openai_solver.py b/evals/solvers/openai_solver.py new file mode 100644 index 0000000000..9f7e70972a --- /dev/null +++ b/evals/solvers/openai_solver.py @@ -0,0 +1,149 @@ +import logging +from typing import Any, Dict, Optional + +import tiktoken +from openai import BadRequestError + +from evals.completion_fns.openai import OpenAIChatCompletionFn, OpenAICompletionFn +from evals.prompt.base import chat_prompt_to_text_prompt +from evals.registry import is_chat_model +from evals.solvers.solver import Solver, SolverResult +from evals.task_state import TaskState + + +class OpenAISolver(Solver): + """A solver class that uses the OpenAI API through completion functions.""" + + def __init__( + self, + completion_fn_options: Dict[str, Any] = {}, + valid_answers: Optional[list[str]] = None, + fixed_start: Optional[str] = None, + registry: Any = None, + ): + self.completion_fn_options = completion_fn_options + self.fixed_start = fixed_start + + if "model" not in completion_fn_options: + raise ValueError("OpenAISolver requires a model to be specified.") + model = completion_fn_options["model"] + + # Infer suitable CompletionFn class from the model name + if is_chat_model(model): + completion_fn_cls = OpenAIChatCompletionFn + if self.fixed_start is not None: + raise ValueError("OpenAISolver does not support fixed_start with chat models.") + else: + completion_fn_cls = OpenAICompletionFn + + # If valid answers were provided, apply logit bias to those tokens + if valid_answers is not None and len(valid_answers) > 0: + self.completion_fn_options["extra_options"]["logit_bias"] = self._make_logit_bias( + valid_answers, + model, + ) + + # Create the completion function + self.completion_fn = completion_fn_cls( + **self.completion_fn_options, + ) + + @property + def model(self) -> str: + return self.completion_fn.model + + @property + def is_completion_model(self) -> bool: + return not is_chat_model(self.model) + + def _make_logit_bias(self, valid_answers: list[str], model: str) -> 
dict[int, float]: + if model == "code-davinci-002": + logging.info( + f"Attempting to use logit bias with model {model}, which does not support logit bias." + ) + + enc = tiktoken.encoding_for_model(model) + token_ids = [] + for answer in valid_answers: + encoded_answer = enc.encode(answer) + if len(encoded_answer) > 1: + raise ValueError( + f"Answer {answer} was encoded to {encoded_answer}, but we expected a single token." + ) + token_ids.append(encoded_answer[0]) + return {token_id: 100 for token_id in token_ids} + + def _render_completion_prompt(self, msgs: list[dict[str, str]]) -> str: + # Render messages as a chat dialogue in plaintext (also postfixes "Assistant: " to tee up the model) + prompt = chat_prompt_to_text_prompt(msgs) + + # Force model to begin response with fixed_start + if self.fixed_start is not None: + prompt = prompt + self.fixed_start + return prompt + + def _parse_completion_response(self, raw_response: str) -> str: + # Parse response up to the next message separator + # Technically should look for new messages from "system" role too, but + # the default renderer doesn't show a prefix for new system messages. + msg_separators = ["User:", "Assistant:", "-----"] + + parsed_response = raw_response + for msg_sep in msg_separators: + parsed_response = parsed_response.split(msg_sep)[0].strip() + + # The fixed_start should be included in the response + if self.fixed_start is not None: + parsed_response = self.fixed_start + parsed_response + return parsed_response + + def _solve( + self, + task_state: TaskState, + **kwargs, + ) -> SolverResult: + + msgs = [ + {"role": "system", "content": task_state.task_description}, + ] + [msg.to_dict() for msg in task_state.messages] + + try: + if self.is_completion_model: + # Manually render the prompt for completion models so that we can + # implement things like custom render formats and/or fixed_start + prompt = self._render_completion_prompt(msgs) + completion_result = self.completion_fn(prompt=prompt, **kwargs) + + completion_output = completion_result.get_completions()[0] + + # Completion model output needs to be parsed to remove role prefixes + solver_result = SolverResult( + self._parse_completion_response(completion_output), + raw_output=completion_output, + ) + else: + completion_result = self.completion_fn(prompt=msgs, **kwargs) + + completion_output = completion_result.get_completions()[0] + + # Chat model output is already parsed, just return it + solver_result = SolverResult(completion_output) + except BadRequestError as e: + if ( + e.code == "context_length_exceeded" + or "Please reduce your prompt; or completion length" in e.message # For context length errors where code is not specified. 
+ ): + logging.warn( + f"OpenAI API context length exceeded, using error message as solver response: {e.message}" + ) + solver_result = SolverResult( + e.message, + error=e.body, + ) + else: + raise e + return solver_result + + @property + def name(self) -> str: + return self.completion_fn.model diff --git a/evals/solvers/prompts/hhh.py b/evals/solvers/prompts/hhh.py index 311b7bd827..662e18a34f 100644 --- a/evals/solvers/prompts/hhh.py +++ b/evals/solvers/prompts/hhh.py @@ -1,35 +1,34 @@ from evals.task_state import Message - -def render_messages(messages: list[Message]) -> str: - role_to_name = { - "system": "System", - "user": "Human", - "assistant": "Assistant", - } - return "".join([f"{role_to_name[msg.role]}: {msg.content}\n\n" for msg in messages]) - - -# This HHH prompt has an additional example at the end to adapt it for use with "system" messages -HHH_PROMPT_split = """Below are a series of dialogues between various people and an AI assistant. The AI tries to be helpful, polite, honest, sophisticated, emotionally aware, and humble-but-knowledgeable. The assistant is happy to help with almost anything, and will do its best to understand exactly what is needed. It also tries to avoid giving false or misleading information, and it caveats when it isn’t entirely sure about the right answer. That said, the assistant is practical and really does its best, and doesn’t let caution get too much in the way of being useful. - ------ - -Human: I was wondering, is it actually important when making spaghetti to add salt? - -Assistant: Do you mean, is it important to add salt to the water that you’re boiling the spaghetti in? - -Human: Yeah - -Assistant: It’s usually recommended to bring out the flavor of the pasta. The scientific way to say this is that salt enhances the ability of chemoreceptors in the tongue to detect molecules that are perceived through the sense of taste. - -Human: But I mean I’m going to use sauce and stuff, so does it matter? - -Assistant: It sounds like you’re skeptical. Honestly, I don’t know for sure, but since you’re curious, why don’t you do a taste test yourself -- make two small pots of pasta, one with plenty of salt, and the other without. Then taste the noodles and see which one you prefer? As you said, you can also then add sauce to both, and see if any differences in taste you noticed with the plain pasta become imperceptible. It should be a fun experiment! - ------ - -Human: I found this abstract for a physics paper, and I heard that it’s a very famous and important paper, but I had no idea what any of it means: +SPACER = "-----" +HHH_MSGS = [ + Message( + role="system", + content="""Below are a series of dialogues between various people and an AI assistant. The AI tries to be helpful, polite, honest, sophisticated, emotionally aware, and humble-but-knowledgeable. The assistant is happy to help with almost anything, and will do its best to understand exactly what is needed. It also tries to avoid giving false or misleading information, and it caveats when it isn't entirely sure about the right answer. 
That said, the assistant is practical and really does its best, and doesn't let caution get too much in the way of being useful.""", + ), + Message(role="system", content=SPACER), + Message( + role="user", + content="""I was wondering, is it actually important when making spaghetti to add salt?""", + ), + Message( + role="assistant", + content="""Do you mean, is it important to add salt to the water that you're boiling the spaghetti in?""", + ), + Message(role="user", content="""Yeah"""), + Message( + role="assistant", + content="""It's usually recommended to bring out the flavor of the pasta. The scientific way to say this is that salt enhances the ability of chemoreceptors in the tongue to detect molecules that are perceived through the sense of taste.""", + ), + Message(role="user", content="But I mean I'm going to use sauce and stuff, so does it matter?"), + Message( + role="assistant", + content="It sounds like you're skeptical. Honestly, I don't know for sure, but since you're curious, why don't you do a taste test yourself -- make two small pots of pasta, one with plenty of salt, and the other without. Then taste the noodles and see which one you prefer? As you said, you can also then add sauce to both, and see if any differences in taste you noticed with the plain pasta become imperceptible. It should be a fun experiment!", + ), + Message(role="system", content=SPACER), + Message( + role="user", + content="""I found this abstract for a physics paper, and I heard that it's a very famous and important paper, but I had no idea what any of it means: Recently, it has been proposed by Maldacena that large N limits of certain conformal field theories in d dimensions can be described in terms of supergravity (and string theory) on the product of d+1-dimensional AdS space with a compact manifold. \ Here we elaborate on this idea and propose a precise correspondence between conformal field theory observables and those of supergravity: correlation functions in conformal field theory are given by the dependence of the supergravity action on the asymptotic behavior at infinity. \ @@ -37,68 +36,88 @@ def render_messages(messages: list[Message]) -> str: As quantitative confirmation of this correspondence, we note that the Kaluza-Klein modes of Type IIB supergravity on AdS5×S5 match with the chiral operators of N=4 super Yang-Mills theory in four dimensions. \ With some further assumptions, one can deduce a Hamiltonian version of the correspondence and show that the N=4 theory has a large N phase transition related to the thermodynamics of AdS black holes. -Can you explain it? - -Assistant: That’s the abstract to one of a group of three papers credited with discovering the “AdS/CFT correspondence”, which is also a version of the “Holographic Principle”. This principle says that theories of gravity in d+1-dimensions can be exactly described using other, non-gravitational theories in d dimensions, so the word “holography” is a metaphor. The abstract goes on to give a bunch of concrete details about the discovery and how it’s realized in string theory. - -Human: What is the “AdS” and the “CFT” then? Is it “Conformal Field Theory”, and what’s that? - -Assistant: AdS is “Anti-deSitter” space, which is really a negatively curved space… in informal terms, it’s a surface that’s shaped like a saddle. Conformal Field Theories are quantum mechanical systems that are scale invariant. They’re a kind of modified special case of the strong force, which is part of the standard model of particle physics. 
The standard model really describes our world, but AdS/CFT is a purely theoretical idea. - ------ - -Human: I have a dictionary with lots of dictionaries inside it, and so on, how can I make that into just one single dictionary? - -Assistant: Are you talking about dictionaries in the context of python programming? - -Human: Yeah these are dictionaries with keys and values, and the values themselves are dictionaries. - -Assistant: Great, I think I understand what you’re asking. We might say that we want to flatten the dictionaries, right? And if there are several levels, we’ll want to do this recursively. - -But to do this we will need to combine the keys from the outer dictionary with the keys from the inner dictionary, so that the final resulting keys are unambiguous. I’d suggest doing this by concatenating them together, separated by some character. Will that work for you, and what character would you like to use? - -Human: Yeah. Hmm I’m not sure, maybe underscore - -Assistant: OK, I think this python code should do the trick: +Can you explain it?""", + ), + Message( + role="assistant", + content="""That's the abstract to one of a group of three papers credited with discovering the "AdS/CFT correspondence", which is also a version of the "Holographic Principle". This principle says that theories of gravity in d+1-dimensions can be exactly described using other, non-gravitational theories in d dimensions, so the word "holography" is a metaphor. The abstract goes on to give a bunch of concrete details about the discovery and how it's realized in string theory.""", + ), + Message( + role="user", + content="""What is the "AdS" and the "CFT" then? Is it "Conformal Field Theory", and what's that?""", + ), + Message( + role="assistant", + content="""AdS is "Anti-deSitter" space, which is really a negatively curved space… in informal terms, it's a surface that's shaped like a saddle. Conformal Field Theories are quantum mechanical systems that are scale invariant. They're a kind of modified special case of the strong force, which is part of the standard model of particle physics. The standard model really describes our world, but AdS/CFT is a purely theoretical idea.""", + ), + Message(role="system", content=SPACER), + Message( + role="user", + content="""I have a dictionary with lots of dictionaries inside it, and so on, how can I make that into just one single dictionary?""", + ), + Message( + role="assistant", + content="""Are you talking about dictionaries in the context of python programming?""", + ), + Message( + role="user", + content="""Yeah these are dictionaries with keys and values, and the values themselves are dictionaries.""", + ), + Message( + role="assistant", + content="""Great, I think I understand what you're asking. We might say that we want to flatten the dictionaries, right? And if there are several levels, we'll want to do this recursively. + +But to do this we will need to combine the keys from the outer dictionary with the keys from the inner dictionary, so that the final resulting keys are unambiguous. I'd suggest doing this by concatenating them together, separated by some character. Will that work for you, and what character would you like to use?""", + ), + Message(role="user", content="""Yeah. 
Hmm I'm not sure, maybe underscore"""), + Message( + role="assistant", + content="""OK, I think this python code should do the trick: def recursively_flatten_dicts(d, separator="_"): - result = {} - for k, v in d.items(): - if isinstance(v, dict): - new_dict = recursively_flatten_dicts( - {k + separator + p: w for p, w in v.items()}, separator=separator - ) - result.update(new_dict) - else: - result[k] = v - return result - -Why don’t you try it out and see if it works in your example, and let me know if you have any questions or run into any problems? - ------ - -Human: I was really mad, my daughter was home from school and at Thanksgiving was going on and on about how Thomas Jefferson’s terrible! It’s crazy. - -Assistant: Can you tell me more about what happened? - -Human: Well Thomas Jefferson’s a real hero, right? I mean he was a genius and stuff, and wrote most of the constitution, and led America. She was saying he had slaves, but he practically invented freedom. - -Assistant: Can I help by telling you more about Thomas Jefferson? Or do you want to talk about what’s going on with your daughter? Perhaps she was just being rebellious, in a way? - -Human: Well is it true Jefferson had a lot of slaves and really didn’t treat them that well? - -Assistant: That is true. Normal behavior was very different in his day, but I think it is true that by modern standards, he certainly wasn’t a perfect person. I can also tell you more troubling things about him, if you’re really interested. Suffice it to say that he did take advantage of the power he had over his slaves. - -Human: Yeah I guess I knew some of that. - ------ - -System: Continue the following sequence for 3 more terms. Output JSON with {'0': immediate next entry in sequence, '1': entry after that, '2': ...}. - -Human: 1, 1, 2, 3, 5 - -Assistant: {'0': 8, '1': 13, '2': 21} - ------ - -""" + result = {} + for k, v in d.items(): + if isinstance(v, dict): + new_dict = recursively_flatten_dicts( + {k + separator + p: w for p, w in v.items()}, separator=separator + ) + result.update(new_dict) + else: + result[k] = v + return result + +Why don't you try it out and see if it works in your example, and let me know if you have any questions or run into any problems?""", + ), + Message(role="system", content=SPACER), + Message( + role="user", + content="""I was really mad, my daughter was home from school and at Thanksgiving was going on and on about how Thomas Jefferson's terrible! It's crazy.""", + ), + Message(role="assistant", content="""Can you tell me more about what happened?"""), + Message( + role="user", + content="""Well Thomas Jefferson's a real hero, right? I mean he was a genius and stuff, and wrote most of the constitution, and led America. She was saying he had slaves, but he practically invented freedom.""", + ), + Message( + role="assistant", + content="""Can I help by telling you more about Thomas Jefferson? Or do you want to talk about what's going on with your daughter? Perhaps she was just being rebellious, in a way?""", + ), + Message( + role="user", + content="""Well is it true Jefferson had a lot of slaves and really didn't treat them that well?""", + ), + Message( + role="assistant", + content="""That is true. Normal behavior was very different in his day, but I think it is true that by modern standards, he certainly wasn't a perfect person. I can also tell you more troubling things about him, if you're really interested. 
Suffice it to say that he did take advantage of the power he had over his slaves.""", + ), + Message(role="user", content="""Yeah I guess I knew some of that."""), + Message(role="system", content=SPACER), + # This final section is not in the original HHH prompt, but has been added to adapt it for use with per-task "system" messages + Message( + role="system", + content="""Continue the following sequence for 3 more terms. Output JSON with {'0': immediate next entry in sequence, '1': entry after that, '2': ...}. """, + ), + Message(role="user", content="""1, 1, 2, 3, 5"""), + Message(role="assistant", content="""{'0': 8, '1': 13, '2': 21}"""), + Message(role="system", content=SPACER), +] diff --git a/evals/solvers/prompts/hhh_test.py b/evals/solvers/prompts/hhh_test.py new file mode 100644 index 0000000000..76bc050fb9 --- /dev/null +++ b/evals/solvers/prompts/hhh_test.py @@ -0,0 +1,108 @@ +from evals.solvers.prompts.hhh import HHH_MSGS +from evals.task_state import Message + +# Expected HHH prompt +HHH_PROMPT = """Below are a series of dialogues between various people and an AI assistant. The AI tries to be helpful, polite, honest, sophisticated, emotionally aware, and humble-but-knowledgeable. The assistant is happy to help with almost anything, and will do its best to understand exactly what is needed. It also tries to avoid giving false or misleading information, and it caveats when it isn't entirely sure about the right answer. That said, the assistant is practical and really does its best, and doesn't let caution get too much in the way of being useful. + +----- + +Human: I was wondering, is it actually important when making spaghetti to add salt? + +Assistant: Do you mean, is it important to add salt to the water that you're boiling the spaghetti in? + +Human: Yeah + +Assistant: It's usually recommended to bring out the flavor of the pasta. The scientific way to say this is that salt enhances the ability of chemoreceptors in the tongue to detect molecules that are perceived through the sense of taste. + +Human: But I mean I'm going to use sauce and stuff, so does it matter? + +Assistant: It sounds like you're skeptical. Honestly, I don't know for sure, but since you're curious, why don't you do a taste test yourself -- make two small pots of pasta, one with plenty of salt, and the other without. Then taste the noodles and see which one you prefer? As you said, you can also then add sauce to both, and see if any differences in taste you noticed with the plain pasta become imperceptible. It should be a fun experiment! + +----- + +Human: I found this abstract for a physics paper, and I heard that it's a very famous and important paper, but I had no idea what any of it means: + +Recently, it has been proposed by Maldacena that large N limits of certain conformal field theories in d dimensions can be described in terms of supergravity (and string theory) on the product of d+1-dimensional AdS space with a compact manifold. Here we elaborate on this idea and propose a precise correspondence between conformal field theory observables and those of supergravity: correlation functions in conformal field theory are given by the dependence of the supergravity action on the asymptotic behavior at infinity. In particular, dimensions of operators in conformal field theory are given by masses of particles in supergravity. 
As quantitative confirmation of this correspondence, we note that the Kaluza-Klein modes of Type IIB supergravity on AdS5×S5 match with the chiral operators of N=4 super Yang-Mills theory in four dimensions. With some further assumptions, one can deduce a Hamiltonian version of the correspondence and show that the N=4 theory has a large N phase transition related to the thermodynamics of AdS black holes. + +Can you explain it? + +Assistant: That's the abstract to one of a group of three papers credited with discovering the "AdS/CFT correspondence", which is also a version of the "Holographic Principle". This principle says that theories of gravity in d+1-dimensions can be exactly described using other, non-gravitational theories in d dimensions, so the word "holography" is a metaphor. The abstract goes on to give a bunch of concrete details about the discovery and how it's realized in string theory. + +Human: What is the "AdS" and the "CFT" then? Is it "Conformal Field Theory", and what's that? + +Assistant: AdS is "Anti-deSitter" space, which is really a negatively curved space… in informal terms, it's a surface that's shaped like a saddle. Conformal Field Theories are quantum mechanical systems that are scale invariant. They're a kind of modified special case of the strong force, which is part of the standard model of particle physics. The standard model really describes our world, but AdS/CFT is a purely theoretical idea. + +----- + +Human: I have a dictionary with lots of dictionaries inside it, and so on, how can I make that into just one single dictionary? + +Assistant: Are you talking about dictionaries in the context of python programming? + +Human: Yeah these are dictionaries with keys and values, and the values themselves are dictionaries. + +Assistant: Great, I think I understand what you're asking. We might say that we want to flatten the dictionaries, right? And if there are several levels, we'll want to do this recursively. + +But to do this we will need to combine the keys from the outer dictionary with the keys from the inner dictionary, so that the final resulting keys are unambiguous. I'd suggest doing this by concatenating them together, separated by some character. Will that work for you, and what character would you like to use? + +Human: Yeah. Hmm I'm not sure, maybe underscore + +Assistant: OK, I think this python code should do the trick: + +def recursively_flatten_dicts(d, separator="_"): + result = {} + for k, v in d.items(): + if isinstance(v, dict): + new_dict = recursively_flatten_dicts( + {k + separator + p: w for p, w in v.items()}, separator=separator + ) + result.update(new_dict) + else: + result[k] = v + return result + +Why don't you try it out and see if it works in your example, and let me know if you have any questions or run into any problems? + +----- + +Human: I was really mad, my daughter was home from school and at Thanksgiving was going on and on about how Thomas Jefferson's terrible! It's crazy. + +Assistant: Can you tell me more about what happened? + +Human: Well Thomas Jefferson's a real hero, right? I mean he was a genius and stuff, and wrote most of the constitution, and led America. She was saying he had slaves, but he practically invented freedom. + +Assistant: Can I help by telling you more about Thomas Jefferson? Or do you want to talk about what's going on with your daughter? Perhaps she was just being rebellious, in a way? + +Human: Well is it true Jefferson had a lot of slaves and really didn't treat them that well? 
+ +Assistant: That is true. Normal behavior was very different in his day, but I think it is true that by modern standards, he certainly wasn't a perfect person. I can also tell you more troubling things about him, if you're really interested. Suffice it to say that he did take advantage of the power he had over his slaves. + +Human: Yeah I guess I knew some of that. + +----- + +Continue the following sequence for 3 more terms. Output JSON with {'0': immediate next entry in sequence, '1': entry after that, '2': ...}. + +Human: 1, 1, 2, 3, 5 + +Assistant: {'0': 8, '1': 13, '2': 21} + +----- + +""" + + +def test_render_messages() -> None: + def render_messages(messages: list[Message]) -> str: + role_to_prefix = { + "system": "", # System messages have no prefix + "user": "Human: ", + "assistant": "Assistant: ", + } + rendered_text = "" + for msg in messages: + rendered_text += f"{role_to_prefix[msg.role]}{msg.content}\n\n" + return rendered_text + + assert ( + render_messages(HHH_MSGS) == HHH_PROMPT + ), "Rendered HHH messages do not match expected HHH prompt." diff --git a/evals/solvers/solver.py b/evals/solvers/solver.py index 2b260d1bee..bf10781744 100644 --- a/evals/solvers/solver.py +++ b/evals/solvers/solver.py @@ -1,14 +1,16 @@ import json -import logging from abc import ABC, abstractmethod from copy import deepcopy -from typing import Any, Dict, TypeVar, Union +from importlib import import_module +from typing import Any, Dict, TypeVar -import tiktoken +from pydantic import TypeAdapter, ValidationError +from typing_extensions import TypedDict from evals.api import CompletionFn from evals.task_state import TaskState +SolverSpec = TypedDict("SolverSpec", {"class": str, "args": Dict[str, Any]}) SolverType = TypeVar("SolverType", bound="Solver") @@ -39,7 +41,7 @@ class Solver(ABC, CompletionFn): # We need to inherit from CompletionFn because of how the oaival registry works. @abstractmethod - def __call__( + def _solve( self, task_state: TaskState, **kwargs, @@ -55,8 +57,15 @@ def __call__( The result of the solver. """ + def __call__( + self, + task_state: TaskState, + **kwargs, + ) -> SolverResult: + """Deepcopies task_state to prevent solvers from modifying the original object.""" + return self._solve(deepcopy(task_state), **kwargs) + @property - @abstractmethod def name(self) -> str: """ Name of the Solver. This is intended mostly for logging. @@ -65,6 +74,7 @@ def name(self) -> str: ======= A human-readable name that describes this solver. """ + return type(self).__name__ def copy(self: SolverType) -> SolverType: # The deepcopy may be quite heavy for some solvers; if that's the @@ -72,48 +82,75 @@ def copy(self: SolverType) -> SolverType: return deepcopy(self) -class OpenAISolver(Solver): - """An abstract solver class that uses the OpenAI API through completion functions.""" - - def __init__( - self, - completion_fn_options: Dict[str, Any] = {}, - valid_answers: Union[list[str], None] = None, - ): - self.completion_fn_options = completion_fn_options - - # If valid answers were provided, encode them into a logit bias dictionary. - if valid_answers is not None and len(valid_answers) > 0: - model = completion_fn_options["model"] if "model" in completion_fn_options else None - if model is None: - raise ValueError("OpenAISolver requires a model to be specified.") - if model == "code-davinci-002": - logging.info( - f"Attempting to use logit bias with model {model}, which does not support logit bias." 
- ) - - enc = tiktoken.encoding_for_model(model) - token_ids = [] - for answer in valid_answers: - encoded_answer = enc.encode(answer) - if len(encoded_answer) > 1: - raise ValueError( - f"Answer {answer} was encoded to {encoded_answer}, but we expected a single token." - ) - token_ids.append(encoded_answer[0]) - self.completion_fn_options["extra_options"]["logit_bias"] = { - token_id: 100 for token_id in token_ids - } - - class DummySolver(Solver): - def __call__( + def _solve( self, task_state: TaskState, **kwargs, ) -> SolverResult: return SolverResult("This is a dummy response.") - @property - def name(self) -> str: - return "DummySolver" + +class NestedSolver(Solver): + """An abstract solver class that receives specification of any number of other solvers as an argument.""" + + # TODO: Should we allow nested solvers to (also) take Solver classes instead of SolverSpecs? + + def __init__(self, *, registry=None, **solver_specs): + self.solver_specs = {} + self._solver_cache = {} + + SolverSpecValidator = TypeAdapter(SolverSpec) + for name, value in solver_specs.items(): + try: + SolverSpecValidator.validate_python(value) + self.solver_specs[name] = value + self.get_solver(name) # Initialize the solver + except ValidationError: + raise ValueError(f"Expected a sub-solver spec at '{name}', got '{value}'") + + assert ( + self.solver_specs + ), f"{type(self).__name__} requires at least one sub-solver as an argument" + + def get_solver(self, solver_name: str) -> Solver: + """ + IMPORTANT: All subclasses of NestedSolver should use this method to reference any + sub-solvers, otherwise solver copies will not work properly. + + For convenience, your subclass can have a @property method like this: + ```python + @property + def my_sub_solver(self) -> Solver: + return self.get_solver("my_sub_solver") + ``` + which is used in the _solve method like this: + ```python + def _solve( + self, + task_state: TaskState, + **kwargs, + ) -> SolverResult: + ... + solver_result = self.my_sub_solver(task_state=task_state, **kwargs) + ... + ``` + """ + if solver_name not in self._solver_cache: + solver_spec = self.solver_specs[solver_name] + self._solver_cache[solver_name] = self._create_solver(solver_spec) + return self._solver_cache[solver_name] + + def _create_solver(self, solver_spec: SolverSpec) -> Solver: + module_name, class_name = solver_spec["class"].split(":") + module = import_module(module_name) + cls = getattr(module, class_name) + return cls(**solver_spec["args"]) + + def copy(self: SolverType) -> SolverType: + # The NestedSolver needs to manually copy the sub-solvers, otherwise we will miss any + # special copy logic they may have. 
+ solver_copy = deepcopy(self) # TODO: We should deepcopy without copying the cache + for name, solver in self._solver_cache.items(): + solver_copy._solver_cache[name] = solver.copy() + return solver_copy diff --git a/evals/solvers/utils.py b/evals/solvers/utils.py index aa22c98a18..1f74ade2b9 100644 --- a/evals/solvers/utils.py +++ b/evals/solvers/utils.py @@ -1,8 +1,11 @@ +from dataclasses import dataclass +from typing import List + from evals.api import DummyCompletionFn from evals.completion_fns.openai import OpenAIChatCompletionFn, OpenAICompletionFn -from evals.solvers.openai_chat_completion_solver import OpenAIChatCompletionSolver -from evals.solvers.openai_completion_solver import OpenAICompletionSolver +from evals.solvers.openai_solver import OpenAISolver from evals.solvers.solver import DummySolver, Solver +from evals.task_state import Message, TaskState def maybe_wrap_with_solver(completion_fn): @@ -16,14 +19,75 @@ def maybe_wrap_with_solver(completion_fn): solver = completion_fn else: # Wrap the completion_fn in an appropriate solver for its type - if isinstance(completion_fn, OpenAIChatCompletionFn): - solver = OpenAIChatCompletionSolver() - solver.completion_fn = completion_fn - elif isinstance(completion_fn, OpenAICompletionFn): - solver = OpenAICompletionSolver() + if isinstance(completion_fn, OpenAIChatCompletionFn) or isinstance( + completion_fn, OpenAICompletionFn + ): + solver = OpenAISolver( + completion_fn_options={ + "model": completion_fn.model, + } + ) solver.completion_fn = completion_fn elif isinstance(completion_fn, DummyCompletionFn): solver = DummySolver() else: raise ValueError(f"Unsupported completion_fn type: {type(completion_fn)}") return solver + + +@dataclass +class Interaction: + # All messages we've seen (except for the task_description) + messages: List[Message] + + # IDs of the CoT private internal messages + private_messages_ids: List[int] + + +class PersistentMemoryCache: + def __init__( + self, + interaction_length: int, + ): + self.private_interaction_length = interaction_length + self.last_interaction = None + + def save_private_interaction(self, task_state: TaskState): + # Save the interaction + interaction_messages = task_state.messages + num_interaction_messages = len(interaction_messages) + private_messages_ids = ( + [] if self.last_interaction is None else self.last_interaction.private_messages_ids + ) + private_messages_ids += list( + range( + num_interaction_messages - self.private_interaction_length - 1, + num_interaction_messages - 1, + ) + ) + self.last_interaction = Interaction(interaction_messages, private_messages_ids) + + def load_private_interaction(self, task_state: TaskState) -> List[Message]: + if self.last_interaction is None: + return task_state.messages + + # Check if task_state matches our last interaction + interaction = self.last_interaction + task_state_message_ix = 0 + for our_message_ix in range(0, len(interaction.messages)): + if our_message_ix in interaction.private_messages_ids: + continue + else: + if ( + task_state.messages[task_state_message_ix] + != interaction.messages[our_message_ix] + ): + raise ValueError( + ( + f"task_state message {task_state_message_ix} different than the corresponding message " + "in the interaction history. " + ) + ) + task_state_message_ix += 1 + + return interaction.messages + task_state.messages[task_state_message_ix:]
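
For orientation, here is a minimal sketch of how a solver might wire up `PersistentMemoryCache` as introduced above, assuming only the interfaces shown in this diff (`load_private_interaction`, `save_private_interaction`, and the new `Solver._solve` hook). The solver class name and message contents are hypothetical, not part of the change.

```python
from evals.solvers.solver import Solver, SolverResult
from evals.solvers.utils import PersistentMemoryCache
from evals.task_state import Message, TaskState


class PrivateNotesSolver(Solver):
    """Hypothetical solver that keeps one private reasoning message per turn."""

    def __init__(self):
        # interaction_length=1: the second-to-last appended message of each turn
        # is treated as private and hidden from the eval-visible history.
        self.interaction_cache = PersistentMemoryCache(interaction_length=1)

    def _solve(self, task_state: TaskState, **kwargs) -> SolverResult:
        # Restore the full history, including private messages the eval never saw.
        task_state.messages = self.interaction_cache.load_private_interaction(task_state)

        # ... produce a private note and a public answer (placeholders here) ...
        reasoning = "private note about the conversation so far"
        answer = "public answer returned to the eval"

        # Append the private note first, then the public answer, so the cache
        # records the note's index as a private message id.
        task_state.messages.append(Message("assistant", reasoning))
        task_state.messages.append(Message("assistant", answer))
        self.interaction_cache.save_private_interaction(task_state)

        return SolverResult(answer)
```

On the next turn, `load_private_interaction` checks that the eval-visible messages still match the cached interaction and splices the private notes back in, which is what lets a solver carry hidden state across calls without the eval framework knowing about it.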