diff --git a/components/home/FeatAnalytics.tsx b/components/home/FeatAnalytics.tsx
index 3ce48c6a2..e8f8b8f4c 100644
--- a/components/home/FeatAnalytics.tsx
+++ b/components/home/FeatAnalytics.tsx
@@ -11,7 +11,7 @@ const features = [
   {
     name: "Quality.",
     description:
-      "Add scores to each trace. Can be model-based evaluation, user feedback, or manual labeling in the Langfuse UI.",
+      "Add scores to each trace. Langfuse supports LLM-as-a-judge evaluators, user feedback, or human annotation in the Langfuse UI.",
     icon: Check,
   },
   {
diff --git a/pages/changelog/2024-10-11-extended-eval-models.mdx b/pages/changelog/2024-10-11-extended-eval-models.mdx
index bb0aa1401..f6020e9e3 100644
--- a/pages/changelog/2024-10-11-extended-eval-models.mdx
+++ b/pages/changelog/2024-10-11-extended-eval-models.mdx
@@ -1,7 +1,7 @@
 ---
 date: 2024-10-11
-title: Langfuse Evals now supports any (tool-calling) LLM
-description: Tool calling makes Langfuse Evals reliable. Previously, only OpenAI models were supported. With this update, you can use any tool-calling LLM for evaluations.
+title: Langfuse LLM-as-a-judge now supports any (tool-calling) LLM
+description: Tool calling makes Langfuse Evals reliable. Previously, only OpenAI models were supported. With this update, you can use any tool-calling LLM when setting up an LLM-as-a-judge evaluator.
 author: Hassieb
 showOgInHeader: false
 ---
@@ -10,7 +10,7 @@
 import { ChangelogHeader } from "@/components/changelog/ChangelogHeader";

-Prior to creating a new eval template, you can now select any model that supports tool calls for which you have an LLM API key in Langfuse. On template creation, Langfuse will test the model with a sample run to ensure it works as expected.
+Prior to creating an evaluator, you can now select any model that supports tool calls for which you have an LLM API key in Langfuse. On evaluator creation, Langfuse will test the model with a sample run to ensure it works as expected.

 **Learn more**
diff --git a/pages/docs/scores/model-based-evals.mdx b/pages/docs/scores/model-based-evals.mdx
index 0e44c916a..488ad18a3 100644
--- a/pages/docs/scores/model-based-evals.mdx
+++ b/pages/docs/scores/model-based-evals.mdx
@@ -2,16 +2,16 @@
 description: Langfuse (open source) helps run model-based evaluations (llm-as-a-judge) on production data to monitor and improve LLMs applications.
 ---

-# Model-based Evaluations in Langfuse
+# LLM-as-a-judge in Langfuse

-Model-based evaluations (_LLM-as-a-judge_) are a powerful tool to automate the evaluation of LLM applications integrated with Langfuse. With model-based evalutions, LLMs are used to score a specific session/trace/LLM-call in Langfuse on criteria such as correctness, toxicity, or hallucinations.
+Using LLM-as-a-judge (model-based evaluations) has proven to be a powerful evaluation tool alongside human annotation. In Langfuse, you can set up LLM evaluators to evaluate the LLM applications you have integrated with Langfuse. These evaluators are used to score a specific session/trace/LLM-call in Langfuse on criteria such as correctness, toxicity, or hallucinations.

-There are two ways to run model-based evaluations in Langfuse:
+Langfuse supports two types of model-based evaluations:

-1. [Via the Langfuse UI (beta)](#ui)
-1. [Via external evaluation pipeline](#evaluation-pipeline)
+1. [LLM-as-a-judge via the Langfuse UI (beta)](#ui)
+1. [Custom evaluators via external evaluation pipeline](#evaluation-pipeline)

-## Via Langfuse UI (beta) [#ui]
+## LLM-as-a-judge via the Langfuse UI (beta) [#ui]

 ![Langfuse](/images/docs/eval-hallucination-template.png)

+We store this information in a so-called evaluator template, so you can reuse it for multiple evaluators.
-### Create an eval config
+![Langfuse](/images/docs/eval-hallucination-template.png)

-Second, we need to specify on which `traces` Langfuse should run the template we created above.
+Second, we need to specify on which `traces` Langfuse should run the evaluator.

-- Select the evaluation template to run.
 - Specify the name of the `scores` which will be created as a result of the evaluation.
 - Filter which newly ingested traces should be evaluated. (Coming soon: select existing traces)
-- Specify how Langfuse should fill the variables in the template. Langfuse can extract data from `trace`, `generations`, `spans`, or `event` objects which belong to a trace. You can choose to take `Input`, `Output` or `metadata` form each of these objects. For `generations`, `spans`, or `events`, you also have to specify the name of the object. We will always take the latest object that matches the name.
-- Reduce the sampling to not run evals on each trace. This helps to save LLM API cost.
-- Add a delay to the evaluation execution. This is how you can ensure all data arrived at Langfuse servers before evaluation is exeucted.
+- Specify how Langfuse should fill the variables in the template. Langfuse can extract data from `trace`, `generations`, `spans`, or `event` objects which belong to a trace. You can choose to take `Input`, `Output` or `metadata` from each of these objects. For `generations`, `spans`, or `events`, you also have to specify the name of the object. We will always take the latest object that matches the name.
+- Reduce the sampling rate to avoid running evaluations on every trace. This helps to save LLM API cost.
+- Add a delay to the evaluation execution. This ensures that all data has arrived at the Langfuse servers before the evaluation is executed.

 ![Langfuse](/images/docs/eval-config.png)

 ### See the progress

-Once the configuration is saved, Langfuse will start running the evals on the traces that match the filter. You can see the progress on the config page or the log table.
+Once the evaluator is saved, Langfuse will start running evaluations on the traces that match the filter. You can view logs on the log page, or open each evaluator to see its configuration and logs.

 ![Langfuse](/images/docs/evals-log.png)

@@ -77,7 +76,7 @@ Upon receiving new traces, navigate to the trace detail view to see the associat

-## Via External Evaluation Pipeline [#evaluation-pipeline]
+## Custom evaluators via external evaluation pipeline [#evaluation-pipeline]
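
As context for the renamed "Custom evaluators via external evaluation pipeline" section touched by the last hunk, below is a minimal sketch of what such a pipeline could look like. It assumes the Langfuse Python SDK v2 (`fetch_traces`, `score`, `flush`; names differ in other SDK versions), and `judge_hallucination` is a hypothetical placeholder for your own LLM-as-a-judge call.

```python
# Minimal sketch of an external evaluation pipeline (assumes Langfuse Python SDK v2;
# fetch_traces/score may be named differently in other SDK versions).
from langfuse import Langfuse

langfuse = Langfuse()  # reads LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY / LANGFUSE_HOST from env


def judge_hallucination(trace_input: str, trace_output: str) -> float:
    """Hypothetical placeholder for your own LLM-as-a-judge call; returns a score in [0, 1]."""
    # Call your preferred tool-calling LLM with a grading prompt here.
    return 1.0


# Fetch a batch of recently ingested traces and score each of them.
traces = langfuse.fetch_traces(limit=50).data
for trace in traces:
    value = judge_hallucination(str(trace.input), str(trace.output))
    langfuse.score(
        trace_id=trace.id,
        name="hallucination",  # shows up as a score on the trace in Langfuse
        value=value,
        comment="scored by external evaluation pipeline",
    )

langfuse.flush()  # ensure all scores are sent before the script exits
```

Scores ingested this way appear on the corresponding traces in Langfuse alongside the scores produced by UI-managed LLM-as-a-judge evaluators.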