diff --git a/index.toml b/index.toml
index 0ab71d8..a336426 100644
--- a/index.toml
+++ b/index.toml
@@ -266,4 +266,12 @@ notebook = "newsletter-agent.ipynb"
 new = true
 experimental = true
 topics = ["Function Calling", "Chat", "Agents"]
-discuss = "https://github.com/deepset-ai/haystack-experimental/discussions/98"
\ No newline at end of file
+discuss = "https://github.com/deepset-ai/haystack-experimental/discussions/98"
+
+[[cookbook]]
+title = "Evaluating AI with Haystack"
+notebook = "evaluating_ai_with_haystack.ipynb"
+new = true
+experimental = true
+topics = ["Evaluation"]
+discuss = "https://github.com/deepset-ai/haystack-experimental/discussions/74"
\ No newline at end of file
diff --git a/notebooks/evaluating_ai_with_haystack.ipynb b/notebooks/evaluating_ai_with_haystack.ipynb
new file mode 100644
index 0000000..44fe5c0
--- /dev/null
+++ b/notebooks/evaluating_ai_with_haystack.ipynb
@@ -0,0 +1,13613 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "uriHEO8pkgSo"
+   },
+   "source": [
+    "# Evaluating AI with Haystack\n",
+    "\n",
+    "by Bilge Yucel ([X](https://x.com/bilgeycl), [LinkedIn](https://www.linkedin.com/in/bilge-yucel/))\n",
+    "\n",
+    "In this cookbook, we walk through the [Evaluators](https://docs.haystack.deepset.ai/docs/evaluators) in Haystack, create an evaluation pipeline, streamline the evaluation with [`EvaluationHarness`](https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/evaluation/harness), and try different Evaluation Frameworks like [Ragas](https://haystack.deepset.ai/integrations/ragas) and [FlowJudge](https://haystack.deepset.ai/integrations/flow-judge).\n",
+    "\n",
+    "πŸ“š **Useful Resources:**\n",
+    "* [Article: Benchmarking Haystack Pipelines for Optimal Performance](https://haystack.deepset.ai/blog/benchmarking-haystack-pipelines)\n",
+    "* [Evaluation Walkthrough](https://haystack.deepset.ai/tutorials/guide_evaluation)\n",
+    "* [haystack-evaluation repo](https://github.com/deepset-ai/haystack-evaluation/tree/main)\n",
+    "* [EvaluationHarness (haystack-experimental)](https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/evaluation/harness)\n",
+    "* [Evaluation tutorial](https://haystack.deepset.ai/tutorials/35_evaluating_rag_pipelines)\n",
+    "* [Evaluation Docs](https://docs.haystack.deepset.ai/docs/evaluation)\n",
+    "\n",
+    "## πŸ“Ί Watch Along\n",
+    "\n",
+    ""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "toc",
+    "id": "WI3_y1HNGiqQ"
+   },
+   "source": [
+    ">[Evaluating AI with Haystack](#scrollTo=uriHEO8pkgSo)\n",
+    "\n",
+    ">[Building your pipeline](#scrollTo=C_WUXQzEQWv8)\n",
+    "\n",
+    ">>[ARAGOG](#scrollTo=Dms5Ict6NGXq)\n",
+    "\n",
+    ">[Human Evaluation](#scrollTo=zTbmQzeXQY1F)\n",
+    "\n",
+    ">[Deciding on Metrics](#scrollTo=-U-QnCBqQcd6)\n",
+    "\n",
+    ">[Building an Evaluation Pipeline](#scrollTo=yLkAcM_5Qfat)\n",
+    "\n",
+    ">[Running Evaluation](#scrollTo=p76stWMQQmPD)\n",
+    "\n",
+    ">>>[Run the RAG Pipeline](#scrollTo=rUfQQzusXhgk)\n",
+    "\n",
+    ">>>[Run the Evaluation](#scrollTo=mfepD9HwXk4Q)\n",
+    "\n",
+    ">[Analyzing Results](#scrollTo=mC_mIqdMQqZG)\n",
+    "\n",
+    ">>[Evaluation Harness (Steps 4, 5, and 6)](#scrollTo=OmkHqAsQZhFr)\n",
+    "\n",
+    ">[Evaluation Frameworks](#scrollTo=gKfrFf1CebJJ)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "id": "jOtY28-vJs35"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install haystack-ai \"sentence-transformers>=3.0.0\" pypdf"
+   ]
+  },
+  {
+   "cell_type": "code",
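+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional sanity check (a sketch, standard library only): print the\n",
+    "# resolved versions of the core dependencies before running the rest\n",
+    "# of the notebook.\n",
+    "from importlib.metadata import version\n",
+    "\n",
+    "for pkg in [\"haystack-ai\", \"sentence-transformers\", \"pypdf\"]:\n",
+    "    print(pkg, version(pkg))"
+   ]
+  },
+  {
+   "cell_type": "code",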
"execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "asqKdrFP4yCA", + "outputId": "6123e2fd-2be4-41c2-ac7f-2b71295b0e81" + }, + "outputs": [], + "source": [ + "!pip install ragas-haystack flow-judge[hf] # evaluation frameworks" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "C_WUXQzEQWv8" + }, + "source": [ + "# 1. Building your pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Dms5Ict6NGXq" + }, + "source": [ + "## ARAGOG\n", + "\n", + "This dataset is based on the paper [Advanced Retrieval Augmented Generation Output Grading (ARAGOG)](https://arxiv.org/pdf/2404.01037). It's a\n", + "collection of papers from ArXiv covering topics around Transformers and Large Language Models, all in PDF format.\n", + "\n", + "The dataset contains:\n", + "- 13 PDF papers.\n", + "- 107 questions and answers generated with the assistance of GPT-4, and validated/corrected by humans.\n", + "\n", + "We have:\n", + "- ground-truth answers\n", + "- questions\n", + "\n", + "Source: https://github.com/deepset-ai/haystack-evaluation/blob/main/datasets/README.md" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 193, + "referenced_widgets": [ + "63a2250eeac3468cabbeea4d4aa737da", + "61ccdd2e814d440c8dfc126007d6942f", + "3f932254e79f4dc894c2a09c9d6f3aa9", + "370eda14b4db451e8e0d567b336f1357", + "ccf595bc10094c468aa1d7e3db5e2001", + "d498664616c744fe914e8ccca0fcc5d9", + "b23e940bb57d45f5a660d56f1a5de12c", + "505907ef6eba4f558a4b2a17791c1057", + "f2e56c86e07c46eaa2b192977021565a", + "2573bf08f6274e709759b664c5c35b0a", + "15377ea70c0548049f90993b7edca597", + "e532542b46164c59a585eed928f954b7", + "5851d89610744d0388ba40fc24ca2029", + "ddf9042eeff94c81b0153db06238b8ac", + "fbdfb8b539684373a1ec4d0b37242aa6", + "82434a705be54bc6a296a4ebecb6cdbb", + "c091207cae2940d29fe445132fe36f6a", + "d96185207ec449c0ab128c2fbd427f27", + "256cddebfac24ca1bf5ac2618edb6432", + "c2f6cd4d91bf4368bc1018cbda44c915", + "c55a9a25325c4a82b977abd962d809e7", + "f4bca807b90d462983a29393d9533c7a", + "a44f7688d3014e3291397457dac7588b", + "da33fab43a554e1599b69d32833f6866", + "aa6786192b6f47dca6e74b7a637025a2", + "631a54146c824ad2beccfdcd7415adaf", + "3d7894a9bd364127b6f4bf2fdbc9ccda", + "97baa482ea8941b7b4690355cb0dbfab", + "c7885fabac1845a68f87a9851f990a61", + "30bfabda14d942b7823c37c00ca09a44", + "de823f9a25174ffb8a96593837f1dc4b", + "231f709581a64f7986f4d03609d14e9b", + "d0bddde6fa49451fb3dca4ce2d0be43e", + "01866330e5b344a08ec8f93164d30b97", + "2489569c1c064626b9643ba405fecc17", + "cf9656d7589649a2b9bbcaf804bdf74e", + "cf99f5f162d346ed9ea9b5649a7f0bce", + "4cf8257e422840bba63046487e7ba8d9", + "4ca58cf2ce034b568b737701a1e22893", + "8e887c552ece49de8b0b267447701ccf", + "f1a68ff6ddb14c0ebe3d23e541993a39", + "441aea8c41484163b0fea778105177cf", + "6e5aa6c450c149bca8feb8ab54574342", + "c0d174f917ed408ebb43ea89b6123b00", + "44e4dafde124442fb8ec03cca27fbfec", + "ec415fb1fee74460835ec20ba868ce28", + "1c945fce1a834897976cdf65203d9359", + "f5a46e3237404d4d8f2f5bf28ddb5b15", + "a3cb6353fb834ff5915b4dbaadf4b4c3", + "ccb687db5a8e4502979d8b80f17d864e", + "4f3c64d7a0554970bbd768650c1f8087", + "645abc85f3024837adc8d8b0c1380f44", + "aaebedbdfe3e4592ab8bb5aef600386e", + "1ce2816e564d4afd9531639914d4d002", + "26a5ae86d1604d41bc0bf97257d241ef", + "8a296418e81549d68bd6e7540fe58411", + "81bdb2ec8f584de58c58bdd1f710507d", + "368a19bb275d47ba91e19198b0f00e52", + 
"54f986fe1fb64fd3992871245674f717", + "50863e3929e54ac092116a98ad126096", + "a8ce80d082584584886b891fcf753b70", + "7fac1705485d45d28e915962d22b5704", + "2b3aaef9c8ae434ca49454991a565e3b", + "3480b56144164c93992f962a5ae5ec68", + "021f75fbbb4c4cd39850724e5658ceb5", + "198bd89d6d0d4fca8eab21f52c101e64", + "c17b2dd231e34437af28e271d5fef893", + "82874942fd564d4c9ee822a76620f408", + "3685f8efff6d413f94a0277231d1e07a", + "fc2847d3341947a09fc5ba6c3e0f8dee", + "d6c42191eb4147beafefce541234df5e", + "902bc8a6be114106a3ff0cc5e8c279e2", + "0a5a23affad44ca9924233997001773c", + "5292150749354885842137c42b859410", + "6cc16aa262c94fa2b6d50ec05cac03e2", + "1a9f640ea2f3404bb4cf833d8ee4a504", + "f15c8b65d5cb46cb8d54504638ed8354", + "082721263d2849f3ae103627c4ed5726", + "e60a261e229449f5a8dd52b65164b76a", + "7c54b173b0774419b493a266d62296f8", + "adfb074b0802452ebe33cdf334e42f82", + "86cb846f6e53472c959ee0fb3f10414d", + "ac9c6339a14547c183141cde27a4e636", + "fbca2085e8424045940184c102fff1eb", + "be8fa174e9ed4df4908c48c6d91f6b61", + "d3d9aff5076c4401a629e55984961ad5", + "634b2e381f9144018796cd7b36a33348", + "4f9d0a50e98348d9a5ada7dc3f6d0bbf", + "8ad5409906b5415cbfd2122e905f4b22", + "287481e1430a4142b1025f5b42aac459", + "d955dd6963a24daf8fc9fdea5bce73d0", + "9d01e7dac83246e197b435eb18b8e24e", + "b10688f66a60455cba45660c00330d05", + "1ce6bedd727e4b5985a0c549d1e2f434", + "c1eb7fa89c3c4528a7d23152b2b63630", + "ffcf6c83efd94681a70e20333712d883", + "320a6af43c0c4997a41a4a7e3bbb55da", + "db3d178744d541519df4d63183fc0507", + "e09ba7aa34bf4756ac85ec2749c8b6ab", + "838ae6fde98149a5b506dd4f4b1f96f0", + "cdf97d9d96a5462ebc25888f4d63aa6c", + "ca62e5b3498b41a5bf9f5143aa6bd6a5", + "8f5e5ed8a4db43018f1f9faae4e88505", + "09c0a4e916b241d3b41190d72ef52e30", + "868f0b6a0404492c8d210ac301a5ddaa", + "34622bf1f4ce4ab696c2db62af51f6aa", + "05d5ef00e89d4d2e8591e2e8202211fc", + "7d7cec045ca64b4bbb0cfd00eedc9872", + "d79cb73f2d4a4421bb7101e838a8c696", + "a28eb87083c64b37b5906749f280061f", + "b3e900b3b32144998dee7dce0e41fb95", + "8759d1dea3094361a3f15cdd5f918b07", + "af11c39be6a74322ba3d3a29e665310b", + "8279362c4e594717b75ac8e62b5ce292", + "30c2ac05df8d4380bb15330e22b222b3", + "f23906ff1ec048fc96a8bb84344c349d", + "5e2efb0b46ab49a29f36393902936fd8", + "c548cd0a428643cc8ec53c8dab844257", + "1209eb96bec9449088f1b92d52532d9f", + "aac78527b4f2478fbaf4476a4fbef263", + "cbeb17c359094ffdbe9f90810b329ba6", + "8cac6309504c4e9fbbb8a0c19df99278", + "68385a2690aa450395aa6968e608bbfb", + "340d238889c3472187adfb787bf547d1", + "aa8f7b2e52b6437d8b38f1c4af74bcbc", + "1e64cc4d56684457931f7d383502012e", + "e2b228cc746c4639a92707b1ece82d24", + "3bd88ff892684f9a864eeeadb7bb3138", + "b6b86e3e25854a7c953b988818773d37", + "1c063c12f0d6483a9ba1d3177a5a4b15", + "90c6345aeb9b4698a7bc3e0270b5ba5c", + "2f23727f57dc4fd1a2274b3e957a0bc6" + ] + }, + "id": "1FOM5mr7K7UU", + "outputId": "afe37c76-bb4b-410f-b06f-3cd9e72f6c04" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "from haystack import Pipeline\n", + "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", + "from haystack.components.converters import PyPDFToDocument\n", + "from haystack.components.embedders import SentenceTransformersDocumentEmbedder\n", + "from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter\n", + "from haystack.components.writers import DocumentWriter\n", + "from haystack.document_stores.types import DuplicatePolicy\n", + "\n", + "embedding_model=\"sentence-transformers/all-MiniLM-L6-v2\"\n", + "document_store = 
InMemoryDocumentStore()\n",
+    "\n",
+    "files_path = \"/content/papers_for_questions\"\n",
+    "pipeline = Pipeline()\n",
+    "pipeline.add_component(\"converter\", PyPDFToDocument())\n",
+    "pipeline.add_component(\"cleaner\", DocumentCleaner())\n",
+    "pipeline.add_component(\"splitter\", DocumentSplitter(split_length=250, split_by=\"word\")) # default splitting by word\n",
+    "pipeline.add_component(\"writer\", DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP))\n",
+    "pipeline.add_component(\"embedder\", SentenceTransformersDocumentEmbedder(embedding_model))\n",
+    "pipeline.connect(\"converter\", \"cleaner\")\n",
+    "pipeline.connect(\"cleaner\", \"splitter\")\n",
+    "pipeline.connect(\"splitter\", \"embedder\")\n",
+    "pipeline.connect(\"embedder\", \"writer\")\n",
+    "pdf_files = [files_path+\"/\"+f_name for f_name in os.listdir(files_path)]\n",
+    "\n",
+    "pipeline.run({\"converter\": {\"sources\": pdf_files}})\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "eBV3-XqaM-QT",
+    "outputId": "f0618a8b-3424-4943-8c8b-5a5397155d01"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "690"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "document_store.count_documents()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Jqhpts_R_RQK"
+   },
+   "source": [
+    "### RAG"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {
+    "id": "ltjnGARjVng3"
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from getpass import getpass\n",
+    "\n",
+    "os.environ[\"OPENAI_API_KEY\"] = getpass('OPENAI_API_KEY: ')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "collapsed": true,
+    "id": "0HA4suneP5Ww",
+    "outputId": "e9b23191-d971-4631-b158-00aaab063dad"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\n",
+       "πŸš… Components\n",
+       "  - query_embedder: SentenceTransformersTextEmbedder\n",
+       "  - retriever: InMemoryEmbeddingRetriever\n",
+       "  - prompt_builder: PromptBuilder\n",
+       "  - generator: OpenAIGenerator\n",
+       "πŸ›€οΈ Connections\n",
+       "  - query_embedder.embedding -> retriever.query_embedding (List[float])\n",
+       "  - retriever.documents -> prompt_builder.documents (List[Document])\n",
+       "  - prompt_builder.prompt -> generator.prompt (str)"
+      ]
+     },
+     "execution_count": 36,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from haystack import Pipeline\n",
+    "from haystack.components.builders import PromptBuilder, AnswerBuilder\n",
+    "from haystack.components.embedders import SentenceTransformersTextEmbedder\n",
+    "from haystack.components.generators import OpenAIGenerator\n",
+    "from haystack.components.retrievers import InMemoryEmbeddingRetriever\n",
+    "\n",
+    "template = \"\"\"\n",
+    "    You have to answer the following question based on the given context information only.\n",
+    "    If the context is empty or just a '\\\\n', answer with None, for example: \"None\".\n",
+    "\n",
+    "    Context:\n",
+    "    {% for document in documents %}\n",
+    "    {{ document.content }}\n",
+    "    {% endfor %}\n",
+    "\n",
+    "    Question: {{question}}\n",
+    "    Answer:\n",
+    "    \"\"\"\n",
+    "\n",
+    "basic_rag = Pipeline()\n",
+    "basic_rag.add_component(\"query_embedder\", SentenceTransformersTextEmbedder(\n",
+    "    model=embedding_model, progress_bar=False\n",
+    "))\n",
+    "basic_rag.add_component(\"retriever\", 
InMemoryEmbeddingRetriever(document_store))\n", + "basic_rag.add_component(\"prompt_builder\", PromptBuilder(template=template))\n", + "basic_rag.add_component(\"generator\", OpenAIGenerator(model=\"gpt-4o-mini\"))\n", + "\n", + "basic_rag.connect(\"query_embedder\", \"retriever.query_embedding\")\n", + "basic_rag.connect(\"retriever\", \"prompt_builder.documents\")\n", + "basic_rag.connect(\"prompt_builder\", \"generator\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zTbmQzeXQY1F" + }, + "source": [ + "# 2. Human Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "id": "DCAJIn3gQbv1" + }, + "outputs": [], + "source": [ + "from typing import List, Tuple\n", + "import json\n", + "\n", + "def read_question_answers() -> Tuple[List[str], List[str]]:\n", + " with open(\"/content/eval_questions.json\", \"r\") as f:\n", + " data = json.load(f)\n", + " questions = data[\"questions\"]\n", + " answers = data[\"ground_truths\"]\n", + " return questions, answers\n", + "\n", + "all_questions, all_answers = read_question_answers()" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "XSeU0xFPSbAi", + "outputId": "b8e5ab0e-11e5-4610-cdf3-3dd9095b6ff4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "107\n", + "107\n" + ] + } + ], + "source": [ + "print(len(all_questions))\n", + "print(len(all_answers))" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "id": "esFxSGF0nfTJ" + }, + "outputs": [], + "source": [ + "questions = all_questions[:15]\n", + "answers = all_answers[:15]" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "BSj3dUnASfS_", + "outputId": "8cbf15d6-6d02-46ce-9022-18952d02e8da" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "How were the questions for the multitask test sourced, and what was the criteria for their inclusion?\n", + "Questions were manually collected by graduate and undergraduate students from freely available online sources, including practice questions for standardized tests and undergraduate courses, ensuring a wide representation of difficulty levels and subjects.\n" + ] + } + ], + "source": [ + "index = 5\n", + "print(questions[index])\n", + "print(answers[index])\n", + "question = questions[index]" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "jIIXQu0LV8dU", + "outputId": "b51c5b29-da7b-4620-adff-2b424c4a016e" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'generator': {'replies': ['The questions for the multitask test were manually collected by graduate and undergraduate students from freely available sources online. These sources included practice questions for tests such as the Graduate Record Examination and the United States Medical Licensing Examination, as well as questions designed for undergraduate courses and those for readers of Oxford University Press books. 
The criteria for inclusion were based on ensuring that the questions covered a range of subjects and difficulty levels, including specific tasks like \"Elementary,\" \"High School,\" \"College,\" or \"Professional,\" with each subject containing a minimum of 100 test examples.'],\n",
+       " 'meta': [{'model': 'gpt-4o-mini-2024-07-18',\n",
+       "   'index': 0,\n",
+       "   'finish_reason': 'stop',\n",
+       "   'usage': {'completion_tokens': 110,\n",
+       "    'prompt_tokens': 4559,\n",
+       "    'total_tokens': 4669,\n",
+       "    'completion_tokens_details': CompletionTokensDetails(audio_tokens=None, reasoning_tokens=0),\n",
+       "    'prompt_tokens_details': PromptTokensDetails(audio_tokens=None, cached_tokens=0)}}]}}"
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "basic_rag.run({\"query_embedder\":{\"text\":question}, \"prompt_builder\":{\"question\": question}})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "-U-QnCBqQcd6"
+   },
+   "source": [
+    "# 3. Deciding on Metrics\n",
+    "\n",
+    "* **Semantic Answer Similarity**: `SASEvaluator` compares the embedding of a generated answer against the embedding of a ground-truth answer, using a common embedding model.\n",
+    "* **Context Relevance**: `ContextRelevanceEvaluator` assesses how relevant the retrieved context is for answering the query.\n",
+    "* **Faithfulness**: `FaithfulnessEvaluator` evaluates whether the generated answer can be derived from the retrieved context.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "yLkAcM_5Qfat"
+   },
+   "source": [
+    "# 4. Building an Evaluation Pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {
+    "id": "NN_M_UZYQkJ-"
+   },
+   "outputs": [],
+   "source": [
+    "from haystack import Pipeline\n",
+    "from haystack.components.evaluators import ContextRelevanceEvaluator, FaithfulnessEvaluator, SASEvaluator\n",
+    "\n",
+    "eval_pipeline = Pipeline()\n",
+    "eval_pipeline.add_component(\"context_relevance\", ContextRelevanceEvaluator(raise_on_failure=False))\n",
+    "eval_pipeline.add_component(\"faithfulness\", FaithfulnessEvaluator(raise_on_failure=False))\n",
+    "eval_pipeline.add_component(\"sas\", SASEvaluator(model=embedding_model))"
+   ]
+  },
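+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Each evaluator can also be run on its own, which is handy for spot-checking. The next cell is a quick sketch (toy strings rather than the ARAGOG data) showing the input/output shape of `SASEvaluator`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Standalone sketch: score one toy prediction against one ground-truth\n",
+    "# answer. warm_up() loads the underlying sentence-transformers model.\n",
+    "from haystack.components.evaluators import SASEvaluator\n",
+    "\n",
+    "sas_check = SASEvaluator(model=embedding_model)\n",
+    "sas_check.warm_up()\n",
+    "print(sas_check.run(\n",
+    "    predicted_answers=[\"Paris is the capital of France.\"],\n",
+    "    ground_truth_answers=[\"The capital of France is Paris.\"],\n",
+    "))  # returns {'individual_scores': [...], 'score': ...}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "p76stWMQQmPD"
+   },
+   "source": [
+    "# 5. 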
Running Evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rUfQQzusXhgk" + }, + "source": [ + "### Run the RAG Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "id": "Wmhn-WYnQmjv" + }, + "outputs": [], + "source": [ + "predicted_answers = []\n", + "retrieved_context = []\n", + "\n", + "for question in questions: # loops over 15 questions\n", + " result = basic_rag.run({\"query_embedder\":{\"text\":question}, \"prompt_builder\":{\"question\": question}}, include_outputs_from={\"retriever\"})\n", + " predicted_answers.append(result[\"generator\"][\"replies\"][0])\n", + " retrieved_context.append(result[\"retriever\"][\"documents\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mfepD9HwXk4Q" + }, + "source": [ + "### Run the Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "6QuMPaJBYGbo", + "outputId": "ab0ef478-8743-40ee-d1a3-c50b1f965e64" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 15/15 [00:11<00:00, 1.26it/s]\n", + "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 15/15 [00:35<00:00, 2.37s/it]\n" + ] + } + ], + "source": [ + "eval_pipeline_results = eval_pipeline.run(\n", + " {\n", + " \"context_relevance\": {\"questions\": questions, \"contexts\": retrieved_context},\n", + " \"faithfulness\": {\"questions\": questions, \"contexts\": retrieved_context, \"predicted_answers\": predicted_answers},\n", + " \"sas\": {\"predicted_answers\": predicted_answers, \"ground_truth_answers\": answers},\n", + " }\n", + ")\n", + "\n", + "results = {\n", + " \"context_relevance\": eval_pipeline_results['context_relevance'],\n", + " \"faithfulness\": eval_pipeline_results['faithfulness'],\n", + " \"sas\": eval_pipeline_results['sas']\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mC_mIqdMQqZG" + }, + "source": [ + "# 6. Analyzing Results" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bmamofudbWDw" + }, + "source": [ + "[EvaluationRunResult](https://docs.haystack.deepset.ai/reference/evaluation-api#evaluationrunresult)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 143 + }, + "collapsed": true, + "id": "hkfPmQf6Qq6L", + "outputId": "4d6012bc-d0de-4714-c177-10d6e23965dd" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"eval_results\",\n \"rows\": 3,\n \"fields\": [\n {\n \"column\": \"metrics\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"context_relevance\",\n \"faithfulness\",\n \"sas\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.2209887466070297,\n \"min\": 0.2,\n \"max\": 0.6111111111111112,\n \"num_unique_values\": 3,\n \"samples\": [\n 0.2,\n 0.6111111111111112,\n 0.5460860473414262\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metricsscore
0context_relevance0.200000
1faithfulness0.611111
2sas0.546086
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " metrics score\n", + "0 context_relevance 0.200000\n", + "1 faithfulness 0.611111\n", + "2 sas 0.546086" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from haystack.evaluation import EvaluationRunResult\n", + "\n", + "inputs = {\n", + " 'questions': questions,\n", + " 'contexts': retrieved_context,\n", + " 'true_answers': answers,\n", + " 'predicted_answers': predicted_answers\n", + "}\n", + "run_name=\"rag_eval\"\n", + "eval_results = EvaluationRunResult(run_name=run_name, inputs=inputs, results=results)\n", + "eval_results.score_report()" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 849 + }, + "collapsed": true, + "id": "f1rGBuvocwLB", + "outputId": "d607b987-2f03-47dd-f3ee-0cedd73fe4b9" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"eval_results\",\n \"rows\": 15,\n \"fields\": [\n {\n \"column\": \"questions\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 15,\n \"samples\": [\n \"What detailed methodology does LLaMA utilize to ensure the diversity of its pre-training data, particularly in the context of filtering and language identification?\",\n \"What specific enhancements are recommended for language models to bridge the knowledge application gap identified through comprehensive multitask testing?\",\n \"What are the two main tasks BERT is pre-trained on?\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"contexts\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"true_answers\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 15,\n \"samples\": [\n \"LLaMA's methodology for ensuring data diversity includes comprehensive filtering criteria to remove low-quality content, language identification to support multilingual capabilities, and a balanced inclusion of various data sources such as Wikipedia, web text, and books, emphasizing the representation of a wide range of topics and languages.\",\n \"Enhancements should focus on developing models with improved procedural knowledge, superior calculation abilities, and a more accurate calibration between confidence and actual performance, directly addressing the weaknesses uncovered in current evaluations.\",\n \"Masked LM (MLM) and Next Sentence Prediction (NSP).\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"predicted_answers\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 13,\n \"samples\": [\n \"DetectGPT's detection approach is significant in the context of evolving large language model (LLM) capabilities as it addresses the challenges posed by these models, which are increasingly adept at generating coherent and convincing text. As LLMs improve, they become more attractive tools for various applications, including education and journalism; however, this rise brings with it the potential for misuse, such as the production of misleading or inaccurate information. \\n\\nDetectGPT leverages the hypothesis that machine-generated texts typically occupy regions of negative curvature in the log probability function of the generating model. 
By comparing the log probabilities of original passages with those of slightly modified perturbations, DetectGPT can effectively differentiate between human-written and model-generated text. This method not only enhances detection accuracy but also adapts to the rapid advancements in LLMs, making it a timely solution as these models proliferate.\\n\\nFurthermore, as LLMs are deployed in real-world contexts, such as automatic essay writing or news generation with minimal human oversight, the risks of fraudulence and misinformation escalate. DetectGPT provides a potential safeguard by improving the reliability of detecting AI-generated content, helping educators and news consumers ascertain the authenticity of the texts they engage with. \\n\\nOverall, DetectGPT's innovative detection strategy is crucial for mitigating the associated risks of LLM misuse, ensuring that the growing use of these technologies does not compromise academic integrity or the credibility of information disseminated to the public. Its ongoing development and improvement could serve as a vital countermeasure in an era of AI where trustworthiness becomes increasingly difficult to verify.\",\n \"The context does not provide specific enhancements recommended for language models to bridge the knowledge application gap identified through comprehensive multitask testing. It mentions that models need \\\"substantial improvements\\\" and highlights issues with accuracy on socially important subjects like morality and law, but does not detail specific enhancements. Therefore, the answer is:\\n\\nNone\",\n \"The two main tasks BERT is pre-trained on are the \\\"masked language model\\\" (MLM) task and the \\\"next sentence prediction\\\" (NSP) task.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"context_relevance\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"faithfulness\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.46147910349544863,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n 0.0,\n 0.6666666666666666\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sas\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3054620861350202,\n \"min\": -0.005470070987939835,\n \"max\": 0.9627792835235596,\n \"num_unique_values\": 15,\n \"samples\": [\n -0.005470070987939835,\n 0.3704226613044739\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
questionscontextstrue_answerspredicted_answerscontext_relevancefaithfulnesssas
0What are the two main tasks BERT is pre-traine...[Document(id=1996eb783b7e2934527de00e3d5f82fb5...Masked LM (MLM) and Next Sentence Prediction (...The two main tasks BERT is pre-trained on are ...01.0000000.552495
1What model sizes are reported for BERT, and wh...[Document(id=8906a653a71ec55161d5f8c6203335456...BERTBASE (L=12, H=768, A=12, Total Parameters=...The BERT model sizes reported are:\\n\\n1. **BER...00.0000000.664142
2How does BERT's architecture facilitate the us...[Document(id=320d3c00ef93938ee6cc92f6a742ba1ed...BERT uses a multi-layer bidirectional Transfor...BERT's architecture facilitates the use of a u...01.0000000.817575
3Can you describe the modifications LLaMA makes...[Document(id=f360dea1ec15f8f778718ae1e13eb855b...LLaMA incorporates pre-normalization (using R...None00.0000000.015276
4How does LLaMA's approach to embedding layer o...[Document(id=f360dea1ec15f8f778718ae1e13eb855b...LLaMA introduces optimizations in its embeddin...None00.0000000.075397
5How were the questions for the multitask test ...[Document(id=9415e713cf73ffea5ca383126c54f7ec4...Questions were manually collected by graduate ...The questions for the multitask test were manu...01.0000000.652526
6How does BERT's performance on the GLUE benchm...[Document(id=606c67eb5eeb136ad77616d2ef06a580b...BERT achieved new state-of-the-art on the GLUE...BERT significantly outperforms all previous st...00.8333330.857448
7What significant improvements does BERT bring ...[Document(id=4ca8419f5c01c094bbda9617b3ce328cb...BERT set new records on SQuAD v1.1 and v2.0, s...BERT brings substantial improvements to the SQ...01.0000000.586361
8What unique aspect of the LLaMA training datas...[Document(id=236e5c1e3c782e68912426a7f2543710c...LLaMA's training dataset is distinctive for b...The unique aspect of the LLaMA training datase...00.6666670.962779
9What detailed methodology does LLaMA utilize t...[Document(id=9885fbffa74c564acd7a255e8b66a3343...LLaMA's methodology for ensuring data diversit...None00.000000-0.005470
10What are the specific domains covered by the m...[Document(id=9415e713cf73ffea5ca383126c54f7ec4...The test covers 57 subjects across STEM, human...The specific domains covered by the multitask ...11.0000000.620999
11What specific enhancements are recommended for...[Document(id=ac7c3c2e29e31cf47dc1027f7d31ea94d...Enhancements should focus on developing models...The context does not provide specific enhancem...01.0000000.370423
12What methodology does DetectGPT use to generat...[Document(id=a862c889a8c02afa59e422bc2cbeb2425...DetectGPT generates minor perturbations using ...DetectGPT generates minor perturbations in the...10.6666670.734830
13Discuss the significance of DetectGPT's detect...[Document(id=ef8ff80b74a24f6cec05be8135930ba1b...DtectGPT's approach is significant as it provi...DetectGPT's detection approach is significant ...01.0000000.508008
14How is the student model, DistilBERT, initiali...[Document(id=33d936e116b7764ce538130aaa40c7b37...DistilBERT is initialized from the teacher mod...The student model, DistilBERT, is initialized ...10.0000000.778503
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " questions \\\n", + "0 What are the two main tasks BERT is pre-traine... \n", + "1 What model sizes are reported for BERT, and wh... \n", + "2 How does BERT's architecture facilitate the us... \n", + "3 Can you describe the modifications LLaMA makes... \n", + "4 How does LLaMA's approach to embedding layer o... \n", + "5 How were the questions for the multitask test ... \n", + "6 How does BERT's performance on the GLUE benchm... \n", + "7 What significant improvements does BERT bring ... \n", + "8 What unique aspect of the LLaMA training datas... \n", + "9 What detailed methodology does LLaMA utilize t... \n", + "10 What are the specific domains covered by the m... \n", + "11 What specific enhancements are recommended for... \n", + "12 What methodology does DetectGPT use to generat... \n", + "13 Discuss the significance of DetectGPT's detect... \n", + "14 How is the student model, DistilBERT, initiali... \n", + "\n", + " contexts \\\n", + "0 [Document(id=1996eb783b7e2934527de00e3d5f82fb5... \n", + "1 [Document(id=8906a653a71ec55161d5f8c6203335456... \n", + "2 [Document(id=320d3c00ef93938ee6cc92f6a742ba1ed... \n", + "3 [Document(id=f360dea1ec15f8f778718ae1e13eb855b... \n", + "4 [Document(id=f360dea1ec15f8f778718ae1e13eb855b... \n", + "5 [Document(id=9415e713cf73ffea5ca383126c54f7ec4... \n", + "6 [Document(id=606c67eb5eeb136ad77616d2ef06a580b... \n", + "7 [Document(id=4ca8419f5c01c094bbda9617b3ce328cb... \n", + "8 [Document(id=236e5c1e3c782e68912426a7f2543710c... \n", + "9 [Document(id=9885fbffa74c564acd7a255e8b66a3343... \n", + "10 [Document(id=9415e713cf73ffea5ca383126c54f7ec4... \n", + "11 [Document(id=ac7c3c2e29e31cf47dc1027f7d31ea94d... \n", + "12 [Document(id=a862c889a8c02afa59e422bc2cbeb2425... \n", + "13 [Document(id=ef8ff80b74a24f6cec05be8135930ba1b... \n", + "14 [Document(id=33d936e116b7764ce538130aaa40c7b37... \n", + "\n", + " true_answers \\\n", + "0 Masked LM (MLM) and Next Sentence Prediction (... \n", + "1 BERTBASE (L=12, H=768, A=12, Total Parameters=... \n", + "2 BERT uses a multi-layer bidirectional Transfor... \n", + "3 LLaMA incorporates pre-normalization (using R... \n", + "4 LLaMA introduces optimizations in its embeddin... \n", + "5 Questions were manually collected by graduate ... \n", + "6 BERT achieved new state-of-the-art on the GLUE... \n", + "7 BERT set new records on SQuAD v1.1 and v2.0, s... \n", + "8 LLaMA's training dataset is distinctive for b... \n", + "9 LLaMA's methodology for ensuring data diversit... \n", + "10 The test covers 57 subjects across STEM, human... \n", + "11 Enhancements should focus on developing models... \n", + "12 DetectGPT generates minor perturbations using ... \n", + "13 DtectGPT's approach is significant as it provi... \n", + "14 DistilBERT is initialized from the teacher mod... \n", + "\n", + " predicted_answers context_relevance \\\n", + "0 The two main tasks BERT is pre-trained on are ... 0 \n", + "1 The BERT model sizes reported are:\\n\\n1. **BER... 0 \n", + "2 BERT's architecture facilitates the use of a u... 0 \n", + "3 None 0 \n", + "4 None 0 \n", + "5 The questions for the multitask test were manu... 0 \n", + "6 BERT significantly outperforms all previous st... 0 \n", + "7 BERT brings substantial improvements to the SQ... 0 \n", + "8 The unique aspect of the LLaMA training datase... 0 \n", + "9 None 0 \n", + "10 The specific domains covered by the multitask ... 1 \n", + "11 The context does not provide specific enhancem... 0 \n", + "12 DetectGPT generates minor perturbations in the... 
1 \n", + "13 DetectGPT's detection approach is significant ... 0 \n", + "14 The student model, DistilBERT, is initialized ... 1 \n", + "\n", + " faithfulness sas \n", + "0 1.000000 0.552495 \n", + "1 0.000000 0.664142 \n", + "2 1.000000 0.817575 \n", + "3 0.000000 0.015276 \n", + "4 0.000000 0.075397 \n", + "5 1.000000 0.652526 \n", + "6 0.833333 0.857448 \n", + "7 1.000000 0.586361 \n", + "8 0.666667 0.962779 \n", + "9 0.000000 -0.005470 \n", + "10 1.000000 0.620999 \n", + "11 1.000000 0.370423 \n", + "12 0.666667 0.734830 \n", + "13 1.000000 0.508008 \n", + "14 0.000000 0.778503 " + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eval_results.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "dFrIfDXiaP41", + "outputId": "4f187782-7933-4bc0-89b9-51c6672c20ec" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 \n", + "Question: How does BERT's architecture facilitate the use of a unified model across diverse NLP tasks? \n", + "True Answer: BERT uses a multi-layer bidirectional Transformer encoder architecture, allowing for minimal task-specific architecture modifications in fine-tuning. \n", + "Answer: BERT's architecture facilitates the use of a unified model across diverse NLP tasks through its design as a multi-layer bidirectional Transformer encoder. This architecture allows for minimal differences between the pre-trained model and the final downstream model architecture. By using a consistent approach to both pre-training and fine-tuning, BERT can adapt to various tasks with only a simple classification layer added on top. Additionally, BERT's capability to jointly condition on both left and right context in all layers enhances its versatility across different natural language processing tasks, thereby enabling state-of-the-art performances without substantial task-specific modifications.\n", + "BERT: Pre-training of Deep Bidirectional Transformers for\n", + "Language Understanding\n", + "Jacob Devlin Ming-Wei Chang Kenton Lee Kristina Toutanova\n", + "Google AI Language\n", + "{jacobdevlin,mingweichang,kentonl,kristout}@google.com\n", + "Abstract\n", + "We introduce a new language representa-\n", + "tion model called BERT, which stands for\n", + "Bidirectional Encoder Representations from\n", + "Transformers. Unlike recent language repre-\n", + "sentation models (Peters et al., 2018a; Rad-\n", + "ford et al., 2018), BERT is designed to pre-\n", + "train deep bidirectional representations from\n", + "unlabeled text by jointly conditioning on both\n", + "left and right context in all layers. As a re-\n", + "sult, the pre-trained BERT model can be fine-\n", + "tuned with just one additional output layer\n", + "to create state-of-the-art models for a wide\n", + "range of tasks, such as question answering and\n", + "language inference, without substantial task-\n", + "specific architecture modifications.\n", + "BERT is conceptually simple and empirically\n", + "powerful. 
It obtains new state-of-the-art re-\n", + "sults on eleven natural language processing\n", + "tasks, including pushing the GLUE score to\n", + "80.5% (7.7% point absolute improvement),\n", + "MultiNLI accuracy to 86.7% (4.6% absolute\n", + "improvement), SQuAD v1.1 question answer-\n", + "ing Test F1 to 93.2 (1.5 point absolute im-\n", + "provement) and SQuAD v2.0 Test F1 to 83.1\n", + "(5.1 point absolute improvement).\n", + "1 Introduction\n", + "Language model pre-training has been shown to\n", + "be effective for improving many natural language\n", + "processing tasks (Dai and Le, 2015; Peters et al.,\n", + "2018a; Radford et al., 2018; Howard and Ruder,\n", + "2018). These include sentence-level tasks such as\n", + "natural language inference (Bowman et al., 2015;\n", + "Williams et al., 2018) and paraphrasing (Dolan\n", + "and Brockett, 2005), which aim to predict the re-\n", + "lationships between sentences by analyzing them\n", + "holistically, as well as token-level tasks such as\n", + "named entity recognition and question answering,\n", + "where models are required to produce fine-grained\n", + "output at the token level (Tjong Kim Sang and\n", + "De Meulder, 2003; Rajpurkar et al., 2016).\n", + "There are two existing strategies for apply-\n", + "ing pre-trained when\n", + "downstream task data is very small.\n", + "5.3 Feature-based Approach with BERT\n", + "All of the BERT results presented so far have used\n", + "the fine-tuning approach, where a simple classifi-\n", + "cation layer is added to the pre-trained model, and\n", + "all parameters are jointly fine-tuned on a down-\n", + "stream task. However, the feature-based approach,\n", + "where fixed features are extracted from the pre-\n", + "trained model, has certain advantages. First, not\n", + "all tasks can be easily represented by a Trans-\n", + "former encoder architecture, and therefore require\n", + "a task-specific model architecture to be added.\n", + "Second, there are major computational benefits\n", + "to pre-compute an expensive representation of the\n", + "training data once and then run many experiments\n", + "with cheaper models on top of this representation.\n", + "In this section, we compare the two approaches\n", + "by applying BERT to the CoNLL-2003 Named\n", + "Entity Recognition (NER) task (Tjong Kim Sang\n", + "and De Meulder, 2003). In the input to BERT, we\n", + "use a case-preserving WordPiece model, and we\n", + "include the maximal document context provided\n", + "by the data. Following standard practice, we for-\n", + "mulate this as a tagging task but do not use a CRF\n", + "Hyperparams Dev Set Accuracy\n", + "#L #H #A LM (ppl) MNLI-m MRPC SST-2\n", + "3 768 12 5.84 77.9 79.8 88.4\n", + "6 768 3 5.24 80.6 82.2 90.7\n", + "6 768 12 4.68 81.9 84.8 91.3\n", + "12 768 12 3.99 84.4 86.7 92.9\n", + "12 1024 16 3.54 85.7 86.9 93.3\n", + "24 1024 16 3.23 86.6 87.8 93.7\n", + "Table 6: Ablation over BERT model size. #L = the\n", + "number of layers; #H = hidden size; #A = number of at-\n", + "tention heads. β€œLM (ppl)” is the masked LM perplexity\n", + "of held-out training data.\n", + "System Dev F1 Test F1\n", + "ELMo (Peters et al., 2018a) 95.7 92.2\n", + "CVT (Clark et al., 2018) C.2), BERT\n", + "performs (91.0 F1) comparably to our human baseline (94.9 F1). 
Given this small margin, we also\n", + "exclude GAP.\n", + "On Discovering Ongoing Conversations, our BERT baseline achieves an F1 of 51.9 on a version of\n", + "the task cast as sentence pair classification (given two snippets of texts from plays, determine if the\n", + "second snippet is a continuation of the first). This dataset is very class imbalanced (90% negative), so\n", + "we also experimented with a class-balanced version on which our BERT baselines achieves 88.4\n", + "F1. Qualitatively, we also found the task challenging for humans as there was little context for the\n", + "text snippets and the examples were drawn from plays using early English. Given this fairly high\n", + "machine performance and challenging nature for humans, we exclude this task from our benchmark.\n", + "Instructions tables begin on the following page.\n", + "10https://www.kaggle.com/c/quora-insincere-questions-classification/data\n", + "18\fTable 5: The instructions given to crowd-sourced worker describing the training phase for the Choice\n", + "of Plausible Answers (COPA) task.\n", + "The New York University Center for Data Science is collecting your answers for use in research\n", + "on computer understanding of English. Thank you for your help!\n", + "This project is a training task that needs to be completed before working on the main project\n", + "on AMT named Human Performance: Plausible Answer. Once you are done with the training,\n", + "please proceed to the main task! The qualification approval is not immediate but we will add\n", + "you to our qualified workers list within a day.\n", + "In this training, you must answer the question on the page and then, to see how you did, click\n", + "the Check Work button at the bottom of the page before pre-trained with Ima-\n", + "geNet (Deng et al., 2009; Yosinski et al., 2014).\n", + "3 BERT\n", + "We introduce BERT and its detailed implementa-\n", + "tion in this section. There are two steps in our\n", + "framework: pre-training and fine-tuning. Dur-\n", + "ing pre-training, the model is trained on unlabeled\n", + "data over different pre-training tasks. For fine-\n", + "tuning, the BERT model is first initialized with\n", + "the pre-trained parameters, and all of the param-\n", + "eters are fine-tuned using labeled data from the\n", + "downstream tasks. Each downstream task has sep-\n", + "arate fine-tuned models, even though they are ini-\n", + "tialized with the same pre-trained parameters. The\n", + "question-answering example in Figure 1 will serve\n", + "as a running example for this section.\n", + "A distinctive feature of BERT is its unified ar-\n", + "chitecture across different tasks. There is mini-\n", + "mal difference between the pre-trained architec-\n", + "ture and the final downstream architecture.\n", + "Model Architecture BERT’s model architec-\n", + "ture is a multi-layer bidirectional Transformer en-\n", + "coder based on the original implementation de-\n", + "scribed in Vaswani et al. (2017) and released in\n", + "the tensor2tensor library.1 Because the use\n", + "of Transformers has become common and our im-\n", + "plementation is almost identical to the original,\n", + "we will omit an exhaustive background descrip-\n", + "tion of the model architecture and refer readers to\n", + "Vaswani et al. 
(2017) as well as excellent guides\n", + "such as β€œThe Annotated Transformer.”2\n", + "In this work, we denote the number of layers\n", + "(i.e., Transformer blocks) as L, the hidden size as\n", + "H, and the number of self-attention heads as A.3\n", + "We primarily report results on two model sizes:\n", + "BERTBASE (L=12, H=768, A=12, Total Param-\n", + "eters=110M) and BERTLARGE (L=24, H=1024,\n", + "A=16, Total Parameters=340M).\n", + "BERTBASE was chosen to have the same model\n", + "size as OpenAI GPT for comparison purposes.\n", + "Critically, however, the BERT Transformer uses\n", + "bidirectional self-attention, while using the\n", + "default learning rate of (Radford et al., 2018). Im-\n", + "portantly, we generate randomly according to the\n", + "language model distribution, rather than perform-\n", + "ing beam search – this would bias the genera-\n", + "tions towards common words. For the WikiHow\n", + "endings, we used Nucleus Sampling with p β€œ\n", + "0.98, which means that the probability weights for\n", + "the tail (those tokens with cumulative probabil-\n", + "ity mass Δƒ0.02) are zeroed out (Holtzman et al.,\n", + "2019).\n", + "C BERT setup\n", + "We extensively study BERT in this paper, and\n", + "make no changes to the underlying architecture or\n", + "pretraining. For all of the experiments where we\n", + "provide context, we set up the input to the BERT\n", + "model like this:\n", + "[CLS] A woman is outside with a bucket and\n", + "a dog. The dog is running around trying to\n", + "avoid a bath. [SEP] She gets the dog wet,\n", + "then it runs away again [SEP]\n", + "In the case where only the ending is pro-\n", + "vided, we adopt the BERT-style β€˜single-span’ set-\n", + "ting: [CLS] She gets the dog wet, then it runs\n", + "away again [SEP]\n", + "D A discussion on BERT\n", + "Hyperparameters and Instability\n", + "It is worth noting that many of our experiments\n", + "some instability. On the SW AG experiments, we\n", + "use the same hyperparameters as (Devlin et al.,\n", + "2018) - these generally work very well. 13 How-\n", + "ever, we find that they become a bit unstable when\n", + "crossing over to make HellaSwag. Here, we dis-\n", + "cuss some strategies and insight that we picked up\n", + "on.\n", + "a. We use a batch size of 64 examples rather\n", + "than 16, and warm the model up for 20% of\n", + "the dataset (rather than 10%). This helps the\n", + "model adapt to SW AG more gradually, with-\n", + "out diverging early on.\n", + "b. For the Adversarial Filtering et al.,\n", + "2018). Given the productive use of MultiNLI in pretraining and intermediate fine-tuning of pretrained\n", + "language models (Conneau et al., 2017; Phang et al., 2018, i.a.), for CB, RTE, and BoolQ, we use\n", + "MultiNLI as a transfer task by first using the above procedure on MultiNLI. Similarly, given the\n", + "similarity of COPA to SW AG (Zellers et al., 2018), we first fine-tune BERT on SW AG. These results\n", + "are reported as BERT++. For all other tasks, we reuse the results of BERT fine-tuned on just that task.\n", + "Other Baselines We include a baseline where for each task we simply predict the majority class,6\n", + "as well as a bag-of-words baseline where each input is represented as an average of its tokens’ GloVe\n", + "word vectors (the 300D/840B release from Pennington et al., 2014). 
Finally, we list the best known\n",
+    "result on each task as of May 2019, except on tasks which we recast (WSC), resplit (CB), or achieve\n",
+    "the best known result (WiC).\n",
+    "[... output truncated for readability: the rest of the printed context consists of PDF-extracted\n",
+    "excerpts from the BERT, SuperGLUE, and Megatron-LM papers, spliced together at chunk boundaries ...]\n",
+    "For a given token, its input representation is\n",
+    "constructed by summing the corresponding token,\n",
+    "segment, and position embeddings. A visualization\n",
+    "of this construction can be seen \n"
+      ]
+     }
+    ],
+    "source": [
+     "index = 2\n",
+     "print(eval_pipeline_results['context_relevance'][\"individual_scores\"][index], \"\\nQuestion:\", questions[index], \"\\nTrue Answer:\", answers[index], \"\\nAnswer:\", predicted_answers[index])\n",
+     "print(\"\".join([doc.content for doc in retrieved_context[index]]))"
+    ]
+   },
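+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "Beyond inspecting a single example, it can help to list every question whose retrieved context was judged irrelevant. The cell below is a minimal sketch that reuses the `eval_pipeline_results`, `questions`, and `retrieved_context` objects defined earlier; it assumes the individual context relevance scores are 0/1, as in the score report above."
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# List every question whose retrieved context was judged irrelevant (score of 0)\n",
+     "low_relevance = [\n",
+     "    i for i, score in enumerate(eval_pipeline_results[\"context_relevance\"][\"individual_scores\"])\n",
+     "    if score == 0\n",
+     "]\n",
+     "for i in low_relevance:\n",
+     "    print(f\"[{i}] {questions[i]} ({len(retrieved_context[i])} documents retrieved)\")"
+    ]
+   },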
+   {
+    "cell_type": "markdown",
+    "metadata": {
+     "id": "OmkHqAsQZhFr"
+    },
+    "source": [
+     "## Evaluation Harness (Step 4, 5, and 6)\n",
+     "\n",
+     "`EvaluationHarness` bundles the last three steps into a single component:\n",
+     "\n",
+     "* Runs the RAG pipeline\n",
+     "* Runs the evaluation pipeline\n",
+     "* Collects the scores in one report\n",
+     "\n",
+     "> Try `EvaluationHarness` and give us feedback [on GitHub](https://github.com/deepset-ai/haystack-experimental/discussions/74)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {
+     "collapsed": true,
+     "id": "IQlMUUOeZkpT"
+    },
+    "outputs": [],
+    "source": [
+     "from haystack_experimental.evaluation.harness.rag import (\n",
+     "    DefaultRAGArchitecture,\n",
+     "    RAGEvaluationHarness,\n",
+     "    RAGEvaluationMetric,\n",
+     "    RAGEvaluationInput\n",
+     ")\n",
+     "\n",
+     "pipeline_eval_harness = RAGEvaluationHarness(\n",
+     "    rag_pipeline=basic_rag,\n",
+     "    rag_components=DefaultRAGArchitecture.GENERATION_WITH_EMBEDDING_RETRIEVAL,  # query_embedder, retriever, prompt_builder, generator\n",
+     "    metrics={\n",
+     "        RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY,\n",
+     "        RAGEvaluationMetric.FAITHFULNESS,\n",
+     "        RAGEvaluationMetric.CONTEXT_RELEVANCE,\n",
+     "    }\n",
+     ")\n",
+     "\n",
+     "eval_harness_input = RAGEvaluationInput(\n",
+     "    queries=questions,\n",
+     "    ground_truth_answers=answers,\n",
+     "    rag_pipeline_inputs={\n",
+     "        \"prompt_builder\": {\"question\": list(questions)},\n",
+     "    },\n",
+     ")\n",
+     "\n",
+     "harness_eval_run = pipeline_eval_harness.run(inputs=eval_harness_input, run_name=run_name)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 49,
+    "metadata": {
+     "colab": {
+      "base_uri": "https://localhost:8080/",
+      "height": 143
+     },
+     "collapsed": true,
+     "id": "edZNrdB9sKwm",
+     "outputId": "b4d8ad56-578e-4953-bdea-4c1d1d4f1a54"
+    },
+    "outputs": [
+     {
+      "data": {
+       "application/vnd.google.colaboratory.intrinsic+json": {
+        "summary": "{\n  \"name\": \"harness_eval_run\",\n  \"rows\": 3,\n  \"fields\": [\n    {\n      \"column\": \"metrics\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 3,\n        \"samples\": [\n          \"metric_context_relevance\",\n          \"metric_sas\",\n          \"metric_faithfulness\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"score\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0.24119916658007876,\n        \"min\": 0.26666666666666666,\n        \"max\": 0.7477777777777778,\n        \"num_unique_values\": 3,\n        \"samples\": [\n          0.26666666666666666,\n          0.5377212050060431,\n          0.7477777777777778\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}",
+        "type": "dataframe"
+       },
+       "text/html": [
+        "\n",
+        "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metricsscore
0metric_context_relevance0.266667
1metric_sas0.537721
2metric_faithfulness0.747778
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n"
+       ],
+       "text/plain": [
+        "                    metrics     score\n",
+        "0  metric_context_relevance  0.266667\n",
+        "1                metric_sas  0.537721\n",
+        "2       metric_faithfulness  0.747778"
+       ]
+      },
+      "execution_count": 49,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "harness_eval_run.results.score_report()"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {
+     "id": "0EivA58Ck31n"
+    },
+    "source": [
+     "Override some parameters: `RAGEvaluationOverrides` lets us re-run the same harness with a different generator model (`gpt-4`) without rebuilding the pipeline."
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {
+     "collapsed": true,
+     "id": "KhDT1y3Bk3dG"
+    },
+    "outputs": [],
+    "source": [
+     "from haystack_experimental.evaluation.harness.rag import RAGEvaluationOverrides\n",
+     "\n",
+     "# Swap the generator model for gpt-4; all other pipeline settings stay the same\n",
+     "overrides = RAGEvaluationOverrides(rag_pipeline={\n",
+     "    \"generator\": {\"model\": \"gpt-4\"},\n",
+     "})\n",
+     "\n",
+     "harness_eval_run_gpt4 = pipeline_eval_harness.run(inputs=eval_harness_input, run_name=\"harness_eval_run_gpt4\", overrides=overrides)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 51,
+    "metadata": {
+     "colab": {
+      "base_uri": "https://localhost:8080/",
+      "height": 143
+     },
+     "collapsed": true,
+     "id": "ZEtHeWXkwDeH",
+     "outputId": "b115a270-7fbb-4e29-e564-e4708a329b80"
+    },
+    "outputs": [
+     {
+      "data": {
+       "application/vnd.google.colaboratory.intrinsic+json": {
+        "summary": "{\n  \"name\": \"harness_eval_run_gpt4\",\n  \"rows\": 3,\n  \"fields\": [\n    {\n      \"column\": \"metrics\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 3,\n        \"samples\": [\n          \"metric_context_relevance\",\n          \"metric_sas\",\n          \"metric_faithfulness\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"score\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0.2741642634740533,\n        \"min\": 0.26666666666666666,\n        \"max\": 0.7964285714285714,\n        \"num_unique_values\": 3,\n        \"samples\": [\n          0.26666666666666666,\n          0.6540726095438003,\n          0.7964285714285714\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}",
+        "type": "dataframe"
+       },
+       "text/html": [
+        "\n",
+        "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metricsscore
0metric_context_relevance0.266667
1metric_sas0.654073
2metric_faithfulness0.796429
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " metrics score\n", + "0 metric_context_relevance 0.266667\n", + "1 metric_sas 0.654073\n", + "2 metric_faithfulness 0.796429" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "harness_eval_run_gpt4.results.score_report()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "collapsed": true, + "id": "3NVSvHc28TbS", + "outputId": "acd217d6-32c3-49db-a648-00c3b969fba3" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"harness_eval_run\",\n \"rows\": 15,\n \"fields\": [\n {\n \"column\": \"questions\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 15,\n \"samples\": [\n \"What detailed methodology does LLaMA utilize to ensure the diversity of its pre-training data, particularly in the context of filtering and language identification?\",\n \"What specific enhancements are recommended for language models to bridge the knowledge application gap identified through comprehensive multitask testing?\",\n \"What are the two main tasks BERT is pre-trained on?\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"contexts\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"responses\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 13,\n \"samples\": [\n \"DetectGPT's detection approach is significant within the context of evolving large language models (LLMs) due to its ability to effectively identify machine-generated text against the backdrop of increasingly sophisticated AI-generated content. As LLMs, such as GPT-3 and others, continue to improve in their ability to produce coherent and articulate text, the challenge of distinguishing between human-written and machine-generated content becomes more pronounced. This is particularly concerning in applications such as education and journalism, where the integrity of written material is paramount.\\n\\nDetectGPT leverages the hypothesis that machine-generated text often occupies areas of negative curvature within a model\\u2019s log probability function, allowing it to distinguish between human and AI outputs based on how perturbations to the text affect its likelihood score. This methodology not only enhances the accuracy of machine-generated text detection but also highlights the inherent differences in how models generate content compared to human authors. \\n\\nThe potential for misuse of LLMs amplifies the importance of a robust detection mechanism like DetectGPT. As AI continues to be adopted for generating news articles, essays, and even creative content, there is a growing risk of misinformation and academic dishonesty. DetectGPT provides educators, journalists, and content consumers with a valuable tool to evaluate the authenticity of text, thereby empowering them to address issues related to AI-generated content's proliferation, which can include factual inaccuracies and ethical dilemmas associated with authorship and originality.\\n\\nMoreover, as LLM capabilities evolve, the methods and tactics for evading detection, such as paraphrasing or using alternative decoding strategies, are likely to emerge. 
The findings of DetectGPT emphasize the necessity for continuous advancements in detection technologies to keep pace with the innovations in content generation. This ongoing arms race between generation and detection underscores the critical role that tools like DetectGPT will play in maintaining the trustworthiness of written content in a rapidly changing digital landscape.\",\n \"The context does not provide specific enhancements recommended for language models to bridge the knowledge application gap identified through comprehensive multitask testing. It discusses the shortcomings of current models, such as their lopsided performance and difficulties with socially relevant subjects, but does not outline particular recommendations for improvements. Therefore, the answer is:\\n\\nNone\",\n \"The two main tasks BERT is pre-trained on are the masked language model (MLM) and next sentence prediction (NSP).\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ground_truth_answers\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 15,\n \"samples\": [\n \"LLaMA's methodology for ensuring data diversity includes comprehensive filtering criteria to remove low-quality content, language identification to support multilingual capabilities, and a balanced inclusion of various data sources such as Wikipedia, web text, and books, emphasizing the representation of a wide range of topics and languages.\",\n \"Enhancements should focus on developing models with improved procedural knowledge, superior calculation abilities, and a more accurate calibration between confidence and actual performance, directly addressing the weaknesses uncovered in current evaluations.\",\n \"Masked LM (MLM) and Next Sentence Prediction (NSP).\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rag_eval_metric_context_relevance\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rag_eval_metric_sas\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.30258921111168324,\n \"min\": -0.005470070987939835,\n \"max\": 0.8942041397094727,\n \"num_unique_values\": 15,\n \"samples\": [\n -0.005470070987939835,\n 0.31024250388145447\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rag_eval_metric_faithfulness\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.40172577448254165,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 5,\n \"samples\": [\n 0.0,\n 0.75\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"harness_eval_run_gpt4_metric_context_relevance\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"harness_eval_run_gpt4_metric_sas\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.1794187585093937,\n \"min\": 0.22081966698169708,\n \"max\": 0.9491991400718689,\n \"num_unique_values\": 15,\n \"samples\": [\n 0.6814706325531006,\n 0.45860764384269714\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"harness_eval_run_gpt4_metric_faithfulness\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.36557644484599805,\n \"min\": 0.0,\n \"max\": 1.0,\n 
\"num_unique_values\": 5,\n \"samples\": [\n 0.8571428571428571,\n 0.7142857142857143\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
questionscontextsresponsesground_truth_answersrag_eval_metric_context_relevancerag_eval_metric_sasrag_eval_metric_faithfulnessharness_eval_run_gpt4_metric_context_relevanceharness_eval_run_gpt4_metric_sasharness_eval_run_gpt4_metric_faithfulness
0What are the two main tasks BERT is pre-traine...[pre-trained with Ima-\\ngeNet (Deng et al., 20...The two main tasks BERT is pre-trained on are ...Masked LM (MLM) and Next Sentence Prediction (...00.5935951.00000000.2208201.000000
1What model sizes are reported for BERT, and wh...[the\\ntraining loss for 336M and 752M BERT mod...The model sizes reported for BERT and their sp...BERTBASE (L=12, H=768, A=12, Total Parameters=...00.6264801.00000000.7621671.000000
2How does BERT's architecture facilitate the us...[BERT: Pre-training of Deep Bidirectional Tran...BERT's architecture facilitates the use of a u...BERT uses a multi-layer bidirectional Transfor...10.8782121.00000010.6972501.000000
3Can you describe the modifications LLaMA makes...[to the transformer\\narchitecture (Vaswani et ...NoneLLaMA incorporates pre-normalization (using R...00.0152760.00000000.5639440.857143
4How does LLaMA's approach to embedding layer o...[to the transformer\\narchitecture (Vaswani et ...NoneLLaMA introduces optimizations in its embeddin...00.0753970.00000000.6261731.000000
5How were the questions for the multitask test ...[of subjects that either do not neatly fit into...The questions for the multitask test were manu...Questions were manually collected by graduate ...00.6399050.80000000.6118381.000000
6How does BERT's performance on the GLUE benchm...[GLUE provides a lightweight classification API...BERT significantly outperforms previous state-...BERT achieved new state-of-the-art on the GLUE...00.8088571.00000000.8531331.000000
7What significant improvements does BERT bring ...[fine-tuning data shuffling and clas-\\nsifier lay...BERT brings significant improvements to the SQ...BERT set new records on SQuAD v1.1 and v2.0, s...00.6531011.00000000.6621450.375000
8What unique aspect of the LLaMA training datas...[model, Gopher, has worse\\nperformance than Ch...LLaMA was trained exclusively on publicly avai...LLaMA's training dataset is distinctive for b...00.8942041.00000000.9491991.000000
9What detailed methodology does LLaMA utilize t...[the description and satisfies the\\ntest cases....NoneLLaMA's methodology for ensuring data diversit...0-0.0054700.00000000.6814710.000000
10What are the specific domains covered by the m...[of subjects that either do not neatly fit into...The specific domains covered by the multitask ...The test covers 57 subjects across STEM, human...10.5819560.66666710.5324570.714286
11What specific enhancements are recommended for...[Published as a conference paper at ICLR 2021\\...The context does not provide specific enhancem...Enhancements should focus on developing models...00.3102431.00000000.4586081.000000
12What methodology does DetectGPT use to generat...[of the data distribution on DetectGPT, partic...DetectGPT generates minor perturbations in the...DetectGPT generates minor perturbations using ...10.7803531.00000010.8222071.000000
13Discuss the significance of DetectGPT's detect...[different from the\\nsource model, detection p...DetectGPT's detection approach is significant ...DtectGPT's approach is significant as it provi...00.4913601.00000000.5664471.000000
14How is the student model, DistilBERT, initiali...[works focus on building task-specific distilla...The student model, DistilBERT, is initialized ...DistilBERT is initialized from the teacher mod...10.7223490.75000010.8032310.000000
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " questions \\\n", + "0 What are the two main tasks BERT is pre-traine... \n", + "1 What model sizes are reported for BERT, and wh... \n", + "2 How does BERT's architecture facilitate the us... \n", + "3 Can you describe the modifications LLaMA makes... \n", + "4 How does LLaMA's approach to embedding layer o... \n", + "5 How were the questions for the multitask test ... \n", + "6 How does BERT's performance on the GLUE benchm... \n", + "7 What significant improvements does BERT bring ... \n", + "8 What unique aspect of the LLaMA training datas... \n", + "9 What detailed methodology does LLaMA utilize t... \n", + "10 What are the specific domains covered by the m... \n", + "11 What specific enhancements are recommended for... \n", + "12 What methodology does DetectGPT use to generat... \n", + "13 Discuss the significance of DetectGPT's detect... \n", + "14 How is the student model, DistilBERT, initiali... \n", + "\n", + " contexts \\\n", + "0 [pre-trained with Ima-\\ngeNet (Deng et al., 20... \n", + "1 [the\\ntraining loss for 336M and 752M BERT mod... \n", + "2 [BERT: Pre-training of Deep Bidirectional Tran... \n", + "3 [to the transformer\\narchitecture (Vaswani et ... \n", + "4 [to the transformer\\narchitecture (Vaswani et ... \n", + "5 [of subjects that either do not neatly fit into... \n", + "6 [GLUE provides a lightweight classification API... \n", + "7 [fine-tuning data shuffling and clas-\\nsifier lay... \n", + "8 [model, Gopher, has worse\\nperformance than Ch... \n", + "9 [the description and satisfies the\\ntest cases.... \n", + "10 [of subjects that either do not neatly fit into... \n", + "11 [Published as a conference paper at ICLR 2021\\... \n", + "12 [of the data distribution on DetectGPT, partic... \n", + "13 [different from the\\nsource model, detection p... \n", + "14 [works focus on building task-specific distilla... \n", + "\n", + " responses \\\n", + "0 The two main tasks BERT is pre-trained on are ... \n", + "1 The model sizes reported for BERT and their sp... \n", + "2 BERT's architecture facilitates the use of a u... \n", + "3 None \n", + "4 None \n", + "5 The questions for the multitask test were manu... \n", + "6 BERT significantly outperforms previous state-... \n", + "7 BERT brings significant improvements to the SQ... \n", + "8 LLaMA was trained exclusively on publicly avai... \n", + "9 None \n", + "10 The specific domains covered by the multitask ... \n", + "11 The context does not provide specific enhancem... \n", + "12 DetectGPT generates minor perturbations in the... \n", + "13 DetectGPT's detection approach is significant ... \n", + "14 The student model, DistilBERT, is initialized ... \n", + "\n", + " ground_truth_answers \\\n", + "0 Masked LM (MLM) and Next Sentence Prediction (... \n", + "1 BERTBASE (L=12, H=768, A=12, Total Parameters=... \n", + "2 BERT uses a multi-layer bidirectional Transfor... \n", + "3 LLaMA incorporates pre-normalization (using R... \n", + "4 LLaMA introduces optimizations in its embeddin... \n", + "5 Questions were manually collected by graduate ... \n", + "6 BERT achieved new state-of-the-art on the GLUE... \n", + "7 BERT set new records on SQuAD v1.1 and v2.0, s... \n", + "8 LLaMA's training dataset is distinctive for b... \n", + "9 LLaMA's methodology for ensuring data diversit... \n", + "10 The test covers 57 subjects across STEM, human... \n", + "11 Enhancements should focus on developing models... \n", + "12 DetectGPT generates minor perturbations using ... 
\n", + "13 DtectGPT's approach is significant as it provi... \n", + "14 DistilBERT is initialized from the teacher mod... \n", + "\n", + " rag_eval_metric_context_relevance rag_eval_metric_sas \\\n", + "0 0 0.593595 \n", + "1 0 0.626480 \n", + "2 1 0.878212 \n", + "3 0 0.015276 \n", + "4 0 0.075397 \n", + "5 0 0.639905 \n", + "6 0 0.808857 \n", + "7 0 0.653101 \n", + "8 0 0.894204 \n", + "9 0 -0.005470 \n", + "10 1 0.581956 \n", + "11 0 0.310243 \n", + "12 1 0.780353 \n", + "13 0 0.491360 \n", + "14 1 0.722349 \n", + "\n", + " rag_eval_metric_faithfulness \\\n", + "0 1.000000 \n", + "1 1.000000 \n", + "2 1.000000 \n", + "3 0.000000 \n", + "4 0.000000 \n", + "5 0.800000 \n", + "6 1.000000 \n", + "7 1.000000 \n", + "8 1.000000 \n", + "9 0.000000 \n", + "10 0.666667 \n", + "11 1.000000 \n", + "12 1.000000 \n", + "13 1.000000 \n", + "14 0.750000 \n", + "\n", + " harness_eval_run_gpt4_metric_context_relevance \\\n", + "0 0 \n", + "1 0 \n", + "2 1 \n", + "3 0 \n", + "4 0 \n", + "5 0 \n", + "6 0 \n", + "7 0 \n", + "8 0 \n", + "9 0 \n", + "10 1 \n", + "11 0 \n", + "12 1 \n", + "13 0 \n", + "14 1 \n", + "\n", + " harness_eval_run_gpt4_metric_sas \\\n", + "0 0.220820 \n", + "1 0.762167 \n", + "2 0.697250 \n", + "3 0.563944 \n", + "4 0.626173 \n", + "5 0.611838 \n", + "6 0.853133 \n", + "7 0.662145 \n", + "8 0.949199 \n", + "9 0.681471 \n", + "10 0.532457 \n", + "11 0.458608 \n", + "12 0.822207 \n", + "13 0.566447 \n", + "14 0.803231 \n", + "\n", + " harness_eval_run_gpt4_metric_faithfulness \n", + "0 1.000000 \n", + "1 1.000000 \n", + "2 1.000000 \n", + "3 0.857143 \n", + "4 1.000000 \n", + "5 1.000000 \n", + "6 1.000000 \n", + "7 0.375000 \n", + "8 1.000000 \n", + "9 0.000000 \n", + "10 0.714286 \n", + "11 1.000000 \n", + "12 1.000000 \n", + "13 1.000000 \n", + "14 0.000000 " + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "harness_eval_run.results.comparative_individual_scores_report(harness_eval_run_gpt4.results)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "AOjQW-tKwKYw", + "outputId": "2119fc2c-ca7d-4ed7-eca1-66fb39959cd0" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Executing RAG pipeline...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 30/30 [01:50<00:00, 3.67s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Executing evaluation pipeline...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 30/30 [01:05<00:00, 2.18s/it]\n", + "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 30/30 [00:26<00:00, 1.12it/s]\n" + ] + } + ], + "source": [ + "overrides = RAGEvaluationOverrides(rag_pipeline={\n", + " \"retriever\": {\"top_k\": 2},\n", + "})\n", + "\n", + "harness_eval_run_topk10 = pipeline_eval_harness.run(inputs=eval_harness_input, run_name=\"harness_eval_run_topk10\", overrides=overrides)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "rr3OCRC5xGju", + "outputId": "633e9c07-8099-4e52-aa87-d9cab6d71abf" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"harness_eval_run_topk10\",\n \"rows\": 3,\n \"fields\": [\n {\n \"column\": 
\"metrics\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"metric_sas\",\n \"metric_faithfulness\",\n \"metric_context_relevance\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.19021600145472908,\n \"min\": 0.4,\n \"max\": 0.7799999999999999,\n \"num_unique_values\": 3,\n \"samples\": [\n 0.5743034517082075,\n 0.7799999999999999,\n 0.4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metricsscore
0metric_sas0.574303
1metric_faithfulness0.780000
2metric_context_relevance0.400000
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " metrics score\n", + "0 metric_sas 0.574303\n", + "1 metric_faithfulness 0.780000\n", + "2 metric_context_relevance 0.400000" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "harness_eval_run_topk10.results.score_report()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gKfrFf1CebJJ" + }, + "source": [ + "# Evaluation Frameworks" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sSkCqiIgem8X" + }, + "source": [ + "* [RagasEvaluator](https://docs.haystack.deepset.ai/docs/ragasevaluator)\n", + "* [FlowJudge](https://haystack.deepset.ai/integrations/flow-judge)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 17, + "referenced_widgets": [ + "c9e01aa81130454a9ff78424e64e7636", + "eea7dd11d5744b0cbaa64b2d1f1e9bb2", + "e167a31fb4a64cf585659c6146d53cd9", + "da651e97b04e4c20a514380acaa6e1a3", + "f27b118b20454f659bedd44f6b8279df", + "653bec3b02a94e358109480df1edc51c", + "f212f062c39440f48a22a64f20e1ea38", + "075cf1cf976d4e7dbe1a90bfb7de3d38", + "f4083eb8314f4f17a6d4d728062854f0", + "d588388158d0420fba6e078fd4e8d9f6", + "ed6d745f8ecc482eb1b14a93ea7dbcb0", + "2e4919f889414ff9b4dc4724dfedb37c", + "2fff778006f94de3bf1f70ea8f6ea6ea", + "6842a54b398b4e0d9faf505a82780e1d", + "5094cd53c72a420681e0246fa2160740", + "f5c00158d342473b8b639d5577968f28", + "93ea390f70cc479992b8bb1a94c4c71c", + "a13eacf3a2574ca4ac34d20d8c239b71", + "1b3290fee5d247ad91bd42f72e7fe7db", + "5e675a39b19d445c928594f098af41eb", + "5491629c609f4cd5a42d2a75d5954eb7", + "ff21aefa2e454d31bdb9e55c7546f14c", + "283317af490040bcb7362374422f18a0", + "52d9654c65034dc6b766de5fc185bc33", + "60c184484456464390e5c5044a5a64c5", + "debba7bd202c48e3976eb5441e635f8c", + "35efa24612bf4b129250e3aa3866855e", + "a0f3a8c7df314fa7ac7811124332e491", + "b10cea675b0f4f68a6fc2a33c2dddfde", + "a949aac0b2a2492da659a058ab6288f6", + "a7c15c86e5fc42aea2f6db2ab5ee3485", + "e8c9bda938294246a6a386c0ca0fce20", + "cdfe62178a1f481eae5b3d8b808ab421", + "98f0797e51574af1b3877c5978c7dd01", + "6906427fd1f647bb99c6a85f46be8910", + "78cac3ef81eb4726bea3dc14be25367f", + "d30b7c0a62bb4ce0a182a6efc6214cd0", + "48d3b77a242848d1a348aa1c4e438bde", + "bc07a01489554fbda675557a32ba5e0c", + "a644bddb21d2440fb25d16ea3d6b487d", + "746cc7c657244ed384e15a8635126b00", + "ab00467118fa4eff8aaf9e0a3a9c8268", + "891acb2e0ed24333b1eaeeca2c605c2c", + "3242e682319740d8ac12b639ce8076f7", + "355cdcb0f55a4e61b02fbdcf261ddcff", + "66c2131a86444ed4900d349e45e0da16", + "712f9ab37cba41ddad32a1bdf89e97d1", + "02cc9dc20c1d42abaca00d0429dacd78", + "522f8c15d5ed4a27b37a24c6483f43d5", + "a42c691f0a7b4abf98c412af86160886", + "24a7829c53ef4689a7a3dd7ea4b60de1", + "100f11799e54486c93087157ca627499", + "6d79148c38d1428f8c1df5a6d8905f43", + "f0bacaaec7c4487798e49877e8117fa6", + "0ec90aaf7961466f8585365d0eca784f", + "cd1a14abf19b4a75adcaa048790693b1", + "9e77f8f7858b4ce7b4a5ce9b26a46321", + "9cdcfb6fb2084fd0a1095e270374ef88", + "46e6fdcee1b64b49aba3c4bf6e8f5cde", + "abef8a13ae594e7ba5227cf92f474f4c", + "7f048f90cb034d7888468a9a36049bfb", + "99f0c94a03954217b4051f398b6123a7", + "434cb11be9fd4df298d21741f28bf804", + "20a902872bdc47cb9012fa7927dc971d", + "bebea714233148288022d06bb0d799a2", + "e1f348ac8ce14a19933d7072028a317f", + "dcb4948d346a4be4891f9adb19600e5a", + "72849571bbc04e3fa04c1b42a2f733cb", + "1ec79510add34f8a8fe4b6684467d16e", + "cb24c59d1fd24494b8da0a1401353345", + 
"481b5a4d86c647229bd65fdece4c678e", + "7ecf510e5b464640a5a891f79ce267f1", + "a6dd52565f6e441584b420efd4dc5f45", + "2230e9a479e54eccaee61ab62d311cf4", + "531c6c2e46224b17ac1bf496958f95d2", + "0d73a404c2a54a36825f2f575500ec6a", + "a9d95fb50c404ef598b4f9c030b1c04b", + "2a61033ed31d484698d443c1c9dc3a5f", + "08baac6a4c604047817d2e86b5621f4e", + "a712c786e0b349be88e7dc0e44746dce", + "c6fa3b6eb8604694b73c915f235101d3", + "0a0ca849bc2b4898822fd70142cff888", + "cab2c6189e3544a2b027a796c32acb14", + "27c3332383954d1095a077b3f52932e5", + "8ab71d975093439690fafcb717c2b5ba", + "798d4eb219c84e6c8d448aa3a4f8eeb9", + "be764ee2f34d4aea872ad5117e70d9f2", + "48360e5402a14788950612b0f18b46a5", + "a976ee060fb04ae8971828e08d5b2ddb", + "fdb294e653974f278f232a92a323a21f", + "a0b8b2865fdb442983957ae358d4a09f", + "ad7fdd68c08f44f7a77b511680b39378", + "e98a5467c74c4a16b93671ae244adc75", + "8a24fecc97aa4ce1bf7f8e7134c2113e", + "b5418e89b2b4404799101f04b28fdea1", + "8838c44b5cf2466f90cb107996c72098", + "3fd0682779b14b209a2cf429de2be224", + "3012252aa33141ff992146d7b6b6e32e", + "efa33506f7a44f2caf5b1617810ee800", + "f70521a3072c4e0386744f90d27ad4f8", + "53d3b4ebe8b645949565830626a0f37a", + "452a6eca65274cb4b1fa9fcb1eb4a626", + "1153bd91d1e24008a615df3a723856d9", + "b8a4d7e645ab450bbdcca6f2045e8baa", + "99badb6d2f3b4a65b623aa1fd4ae0c2c", + "4eadc6efb5384c3088ef10f75c0321f7", + "1012146e9b3c443685cce6d3cb47915c", + "a3b11be65fc4424daf43023625de82ef", + "0cb9da97027a4cbe866c30503c1a05e4", + "596c19632d0148e38240c7319f6fc6e8", + "2eca53c4646643688a5720dc91e2fdf8", + "42c23a38cf9849479501023a1e09efa0", + "875f0a109b3340868f68e8f1b1b163fc", + "d44591b1141043c2b62a120f96d6040e", + "d97ae561482d42d0af876eefc41ac87c", + "fcca40c3558843319a2cb626473d7b30", + "efb9e73513914694912f650d74b79028", + "e8e3035eb0474b339674119feb7a4f30", + "198bc777716249168e5123ff03dcb186", + "7d5989e0fc4f4cdebfd8ec0d5a08c66f", + "67094be0906b497fb196a1a66f425367", + "76ed9a265ea8458c87e1e7b1923632c9", + "9d3d58ee56484950b8771689cb0c2748", + "e14a56494d6a42bc8a74cc5a4fc8e6c9", + "78a8df437dc848b6a2042c87d6e0d07d", + "c690d36170ca4b3f8490ec7f50db580a", + "7fe1225e285b44678bbc1207250e3360", + "1b8986650a9d4776aeac30f7858a42ad", + "e9fcdbef2b0f4dafa32606eedbec2c9e", + "b021e4f565524cef8c50ffb5c7bd69d9", + "da928d7dd89646ef999d93252934cc1f", + "eb0b1f8179af4b319b7751f26f577440", + "d81e7c8d919142debb6160fca76b1ec1", + "705956bf75a04b41a54a2eb6361eda40", + "cda7e4c99a96406bb706e260e78d1b9a", + "a09c30b008f04b73bab0e9d6ef052a10", + "d00e93e92b45477db7f3fce9f8228e35", + "0585796e760047b48f1593bad5fafe9b", + "178103ae87f849309a1fa0d91fc01d19", + "0d7f52784362470b9f2868ac4dc4960c", + "7a2e9f9c890c42e79910e9d26e5b1853", + "ec805f8e515e4162a286ece91efd29c5", + "dc9c0b2f590c4659bb0c63d324e81ada", + "5d9f76fdbe034fc5b2e6c6a1bed959fd", + "3fb3003ff6a14ab1a3c9fc75aa1a63e4", + "392dbaa82a9e479da371d5467a0b1de9", + "af3f3f946884485ebe748362a7ea4fef", + "a05de003bc614774890a70ac382c2b6f", + "258ca52527254d26a54f289ae2df56c3", + "731d797675b74d0f9ae0f21004c56910", + "cfd2787003154b75ab3902309d5605a2", + "5c64d75e179f4e859a13e7acab51500f", + "9c1f443d70904f9d86f32c6f45f78245", + "78333a4099d8475db3ec7db59968c195" + ] + }, + "id": "n6sHf2fUempZ", + "outputId": "4cefb8ea-d0e2-4661-9887-456f23b7d4ca" + }, + "outputs": [], + "source": [ + "from flow_judge.integrations.haystack import HaystackFlowJudge\n", + "from flow_judge.metrics.presets import RESPONSE_FAITHFULNESS_5POINT\n", + "from flow_judge import Hf\n", + "\n", + "model = Hf(flash_attn=False)\n", + "\n", + 
"flow_judge_evaluator = HaystackFlowJudge(\n", + " metric=RESPONSE_FAITHFULNESS_5POINT,\n", + " model=model,\n", + " progress_bar=True,\n", + " raise_on_failure=True,\n", + " save_results=True,\n", + " fail_on_parse_error=False\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "KC8vmckRg3i9", + "outputId": "2c07eef7-60fe-49f3-eb6e-adc12a023da0" + }, + "outputs": [], + "source": [ + "from haystack_integrations.components.evaluators.ragas import RagasEvaluator, RagasMetric\n", + "\n", + "ragas_evaluator= RagasEvaluator(\n", + " metric=RagasMetric.FAITHFULNESS\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "id": "oolLnIzG0z6N" + }, + "outputs": [], + "source": [ + "str_fj_retrieved_context = []\n", + "for context in retrieved_context:\n", + " str_context = [doc.content for doc in context]\n", + " str_fj_retrieved_context.append(\" \".join(str_context)) # [\"\", \"\", ...]" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "id": "5q05yiCry-En" + }, + "outputs": [], + "source": [ + "str_retrieved_context = []\n", + "for context in retrieved_context:\n", + " str_context = [doc.content for doc in context]\n", + " str_retrieved_context.append(str_context) # [[\"\", \"\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 0, + "referenced_widgets": [ + "1268c19e80b14924864219ec60f19770", + "4773d22c8eeb483cb05d408c9a122240", + "7de92f3ec0e84e6c8d536b8c4a8635da", + "d5cb42a52ac143ee8bf54a06a01ca16e", + "152c377acd4f474196d623f76b76eca2", + "1baf44df004346f59add2579feb6ccb5", + "fc2da2ca084740bab692dd9bfc276908", + "430220d701ee4aaf94e72b155d4fe4e2", + "2bc2dc3cc3a740caa207ef52f32711da", + "595f1aa5b30c4c96b7735c8a3c2bb23c", + "45ff7e6994ee42e1a53c08d1e99963a3" + ] + }, + "collapsed": true, + "id": "LNgcH2Klj-5g", + "outputId": "609cc808-1f73-4f62-de82-47d1d825dd41" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1268c19e80b14924864219ec60f19770", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Evaluating: 0%| | 0/10 [00:00