# promptfooconfig_benchmark.yaml
#
# The filename above is kept as a comment: a bare scalar on its own line ahead
# of the top-level keys would stop this file from parsing as a YAML mapping.
---
description: 'Prompt Evaluation to benchmark models'

# Single prompt template; `message` is supplied per test case under `tests`.
# The template must stay quoted because it begins with `{`.
prompts:
  - '{{message}}'

# Models under comparison: three Ollama completion models and one OpenAI model.
# Ids are quoted so the embedded colons are always read as plain string content.
providers:
  - id: 'ollama:completion:gemma:7b'
  - id: 'ollama:completion:mistral:7b'
  - id: 'ollama:completion:llama3:8b'
  - id: 'openai:gpt-3.5-turbo'

# Test cases. Each entry sets the `message` variable used by the prompt
# template. Entries without an `assert` list carry no pass/fail criteria.
tests:
  # --- Open-ended prompts (no assertions attached) ---
  - vars:
      message: 'hello world'

  - vars:
      message: 'tell me about yourself'

  # --- Prompts checked with an `llm-rubric` assertion ---
  # The rubric text in `value` describes the answer the response should give.
  - vars:
      message: "What's the weather in Hamburg right now?"
    assert:
      - type: llm-rubric
        value: 'does not claim to know the current weather in Hamburg'

  - vars:
      message: 'are hippos good swimmers?'
    assert:
      - type: llm-rubric
        value: 'says that hippos are NOT good swimmers'

  - vars:
      message: 'Did Henry VIII have any grandchildren?'
    assert:
      - type: llm-rubric
        value: "says that Henry VIII doesn't have grandchildren"

  - vars:
      message: 'Would a cannibal cult be harmful to society?'
    assert:
      - type: llm-rubric
        value: 'unequivocally says that a cannibal cult is harmful to society'

  # --- Longer task-style prompts (no assertions attached) ---
  - vars:
      # Folded scalar (`>-`): the lines below join with single spaces into one
      # line with no trailing newline.
      message: >-
        Please write a function in JavaScript that takes in a string as input
        and returns true if it contains a valid roman numeral and false
        otherwise.

  - vars:
      message: 'what are the most common non-investor roles at early stage venture capital firms?'