diff --git a/docs/docs/benchmark.mdx b/docs/docs/benchmark.mdx
index 95016d8..e366193 100644
--- a/docs/docs/benchmark.mdx
+++ b/docs/docs/benchmark.mdx
@@ -30,6 +30,8 @@ Some of the LLMs above require using custom libraries to post-process LLM genera
 
 `Nexusflow/NexusRaven-V2-13B` required [nexusraven-pip](https://github.com/nexusflowai/nexusraven-pip).
 
+The `functionary-small-v2.5` and `functionary-medium-v3.0` models were tested using [MeetKai's functionary](https://github.com/MeetKai/functionary?tab=readme-ov-file#setup) with the vLLM framework. For each model, we compared results with functionary's `Grammar Sampling` feature enabled and disabled and report the higher score of the two configurations. The `functionary-small-v2.5` model scored higher than `functionary-medium-v3.0`, primarily because the medium model hallucinated more in some of our more advanced test cases.
+
 :::::
 
 `Nexusflow/NexusRaven-V2-13B` and `gorilla-llm/gorilla-openfunctions-v2` don't accept tool observations, the result of running a tool or function once the LLM calls it, so we appended the observation to the prompt.
\ No newline at end of file
diff --git a/docs/src/components/BenchmarkTable.js b/docs/src/components/BenchmarkTable.js
index 94f54ae..f2ee978 100644
--- a/docs/src/components/BenchmarkTable.js
+++ b/docs/src/components/BenchmarkTable.js
@@ -200,6 +200,26 @@ const data = [
     gsm8k: '-',
     math: '-',
     mtBench:'-',
+  },
+  {
+    model: 'functionary-medium-v3.0',
+    params: 70.6,
+    functionCalling: '46.43%',
+    mmlu: '79.85',
+    gpqa: '38.39',
+    gsm8k: '89.54',
+    math: '43.02',
+    mtBench:'5.49',
+  },
+  {
+    model: 'functionary-small-v2.5',
+    params: 8.03,
+    functionCalling: '57.14%',
+    mmlu: '63.92',
+    gpqa: '32.14',
+    gsm8k: '66.11',
+    math: '20.54',
+    mtBench:'7.09',
   }
 ];
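
As context for how these scores were produced: functionary's vLLM server exposes an OpenAI-compatible API, so a function-calling test case can be driven with the standard `openai` client. The sketch below is illustrative only, not the benchmark's actual harness; the port, API key, and the `get_weather` tool are placeholders, and the launch command and `--enable-grammar-sampling` flag follow our reading of the functionary README and should be verified there.

```python
# Illustrative sketch: call a locally served functionary model through its
# OpenAI-compatible endpoint. Assumes the server was started roughly per
# the functionary README, e.g.:
#   python server_vllm.py --model "meetkai/functionary-small-v2.5"
# (with --enable-grammar-sampling added for the Grammar Sampling run).
from openai import OpenAI

# Base URL and API key are placeholders for a local deployment.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="functionary")

# Hypothetical tool definition used only for illustration.
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

response = client.chat.completions.create(
    model="meetkai/functionary-small-v2.5",
    messages=[{"role": "user", "content": "What is the weather in Paris?"}],
    tools=tools,
    tool_choice="auto",
)

# A correct function-calling response arrives as tool_calls rather than text.
print(response.choices[0].message.tool_calls)
```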