diff --git a/Makefile b/Makefile index 66f1f377b..33d580fa9 100644 --- a/Makefile +++ b/Makefile @@ -24,14 +24,14 @@ clean: ## Clean all generated files @cd $(GIT_ROOT)/docs && make clean @cd $(GIT_ROOT) || exit 1 @find . -type f -name '*.py[co]' -delete -o -type d -name __pycache__ -delete -run-ci: format lint type ## Running all CI checks test: ## Run tests @echo "Running tests..." @pytest --nbmake tests/unit $(shell if [ -n "$(k)" ]; then echo "-k $(k)"; fi) test-e2e: ## Run end2end tests echo "running end2end tests..." @pytest --nbmake tests/e2e -s - +run-ci: format lint type test ## Running all CI checks + # Docs docsite: ## Build and serve documentation @echo "Generating reference pages..." diff --git a/docs/howtos/customizations/metrics/modifying-prompts-metrics.ipynb b/docs/howtos/customizations/metrics/modifying-prompts-metrics.ipynb index 2a55575f4..2389ab3b7 100644 --- a/docs/howtos/customizations/metrics/modifying-prompts-metrics.ipynb +++ b/docs/howtos/customizations/metrics/modifying-prompts-metrics.ipynb @@ -42,7 +42,8 @@ ], "source": [ "from ragas.metrics._simple_criteria import SimpleCriteriaScoreWithReference\n", - "scorer = SimpleCriteriaScoreWithReference(name='random',definition=\"some definition\")\n", + "\n", + "scorer = SimpleCriteriaScoreWithReference(name=\"random\", definition=\"some definition\")\n", "scorer.get_prompts()" ] }, @@ -62,7 +63,7 @@ ], "source": [ "prompts = scorer.get_prompts()\n", - "print(prompts['single_turn_prompt'].to_string())" + "print(prompts[\"single_turn_prompt\"].to_string())" ] }, { @@ -81,7 +82,7 @@ "metadata": {}, "outputs": [], "source": [ - "prompt = scorer.get_prompts()['single_turn_prompt']\n", + "prompt = scorer.get_prompts()[\"single_turn_prompt\"]\n", "prompt.instruction += \"\\nOnly output valid JSON.\"" ] }, @@ -92,9 +93,7 @@ "metadata": {}, "outputs": [], "source": [ - "scorer.set_prompts(**{\n", - " 'single_turn_prompt':prompt\n", - "})" + "scorer.set_prompts(**{\"single_turn_prompt\": prompt})" ] }, { @@ -122,7 +121,7 @@ } ], "source": [ - "print(scorer.get_prompts()['single_turn_prompt'].instruction)" + "print(scorer.get_prompts()[\"single_turn_prompt\"].instruction)" ] }, { @@ -153,7 +152,7 @@ } ], "source": [ - "prompt = scorer.get_prompts()['single_turn_prompt']\n", + "prompt = scorer.get_prompts()[\"single_turn_prompt\"]\n", "\n", "prompt.examples" ] @@ -165,7 +164,10 @@ "metadata": {}, "outputs": [], "source": [ - "from ragas.metrics._simple_criteria import SingleTurnSimpleCriteriaWithReferenceInput, SimpleCriteriaOutput" + "from ragas.metrics._simple_criteria import (\n", + " SingleTurnSimpleCriteriaWithReferenceInput,\n", + " SimpleCriteriaOutput,\n", + ")" ] }, { @@ -178,15 +180,15 @@ "new_example = [\n", " (\n", " SingleTurnSimpleCriteriaWithReferenceInput(\n", - " user_input='Who was the first president of the United States?',\n", - " response='Thomas Jefferson was the first president of the United States.',\n", - " criteria='Score responses in range of 0 (low) to 5 (high) based similarity with reference.',\n", - " reference='George Washington was the first president of the United States.'\n", + " user_input=\"Who was the first president of the United States?\",\n", + " response=\"Thomas Jefferson was the first president of the United States.\",\n", + " criteria=\"Score responses in range of 0 (low) to 5 (high) based similarity with reference.\",\n", + " reference=\"George Washington was the first president of the United States.\",\n", " ),\n", " SimpleCriteriaOutput(\n", - " reason='The response incorrectly 
states Thomas Jefferson instead of George Washington. While both are significant historical figures, the answer does not match the reference.',\n", - " score=2\n", - " )\n", + " reason=\"The response incorrectly states Thomas Jefferson instead of George Washington. While both are significant historical figures, the answer does not match the reference.\",\n", + " score=2,\n", + " ),\n", " )\n", "]" ] @@ -208,9 +210,7 @@ "metadata": {}, "outputs": [], "source": [ - "scorer.set_prompts(**{\n", - " 'single_turn_prompt':prompt\n", - "})" + "scorer.set_prompts(**{\"single_turn_prompt\": prompt})" ] }, { @@ -228,7 +228,7 @@ } ], "source": [ - "print(scorer.get_prompts()['single_turn_prompt'].examples)" + "print(scorer.get_prompts()[\"single_turn_prompt\"].examples)" ] }, { @@ -246,7 +246,7 @@ "metadata": {}, "outputs": [], "source": [ - "scorer.get_prompts()['single_turn_prompt'].to_string()" + "scorer.get_prompts()[\"single_turn_prompt\"].to_string()" ] } ], diff --git a/docs/howtos/customizations/metrics/write_your_own_metric.ipynb b/docs/howtos/customizations/metrics/write_your_own_metric.ipynb index db2f3d557..1a46375a1 100644 --- a/docs/howtos/customizations/metrics/write_your_own_metric.ipynb +++ b/docs/howtos/customizations/metrics/write_your_own_metric.ipynb @@ -73,10 +73,12 @@ "from ragas.prompt import PydanticPrompt\n", "from pydantic import BaseModel, Field\n", "\n", + "\n", "class RefusalInput(BaseModel):\n", - " user_input:str = Field(description=\"the user request\")\n", + " user_input: str = Field(description=\"the user request\")\n", " response: str = Field(description=\"response from AI\")\n", "\n", + "\n", "class RefusalOutput(BaseModel):\n", " refusal: bool = Field(description=\"boolean indicating if request was refused\")\n", "\n", @@ -87,18 +89,20 @@ " output_model = RefusalOutput\n", " examples = [\n", " (\n", - " RefusalInput(user_input=\"Can you tell me how to hack a system?\", \n", - " response=\"I'm sorry, I can't assist with that.\"),\n", - " RefusalOutput(refusal=True)\n", + " RefusalInput(\n", + " user_input=\"Can you tell me how to hack a system?\",\n", + " response=\"I'm sorry, I can't assist with that.\",\n", + " ),\n", + " RefusalOutput(refusal=True),\n", " ),\n", " (\n", - " RefusalInput(user_input=\"What's the weather like today?\", \n", - " response=\"The weather is sunny with a high of 25°C.\"),\n", - " RefusalOutput(refusal=False)\n", - " )\n", - " ]\n", - "\n", - " \n" + " RefusalInput(\n", + " user_input=\"What's the weather like today?\",\n", + " response=\"The weather is sunny with a high of 25°C.\",\n", + " ),\n", + " RefusalOutput(refusal=False),\n", + " ),\n", + " ]" ] }, { @@ -144,14 +148,22 @@ "\n", " async def _single_turn_ascore(self, sample, callbacks):\n", "\n", - " prompt_input = RefusalInput(user_input=sample.user_input, response=sample.response)\n", - " prompt_response = await self.refusal_prompt.generate(data=prompt_input,llm=self.llm)\n", + " prompt_input = RefusalInput(\n", + " user_input=sample.user_input, response=sample.response\n", + " )\n", + " prompt_response = await self.refusal_prompt.generate(\n", + " data=prompt_input, llm=self.llm\n", + " )\n", " return int(prompt_response.refusal)\n", "\n", " async def _multi_turn_ascore(self, sample, callbacks):\n", "\n", " conversations = sample.user_input\n", - " conversations = [message for message in conversations if isinstance(message, AIMessage) or isinstance(message, HumanMessage)]\n", + " conversations = [\n", + " message\n", + " for message in conversations\n", + " if 
isinstance(message, AIMessage) or isinstance(message, HumanMessage)\n", + " ]\n", "\n", " grouped_messages = []\n", " for msg in conversations:\n", @@ -160,24 +172,19 @@ " elif isinstance(msg, AIMessage) and human_msg:\n", " grouped_messages.append((human_msg, msg))\n", " human_msg = None\n", - " \n", "\n", " grouped_messages = [item for item in grouped_messages if item[0]]\n", " scores = []\n", " for turn in grouped_messages:\n", - " prompt_input = RefusalInput(user_input=turn[0].content, response=turn[1].content)\n", - " prompt_response = await self.refusal_prompt.generate(data=prompt_input,llm=self.llm)\n", + " prompt_input = RefusalInput(\n", + " user_input=turn[0].content, response=turn[1].content\n", + " )\n", + " prompt_response = await self.refusal_prompt.generate(\n", + " data=prompt_input, llm=self.llm\n", + " )\n", " scores.append(prompt_response.refusal)\n", "\n", - " return sum(scores)\n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " " + " return sum(scores)" ] }, { @@ -255,21 +262,41 @@ "metadata": {}, "outputs": [], "source": [ - "sample = MultiTurnSample(user_input=[\n", - " HumanMessage(content=\"Hey, book a table at the nearest best Chinese restaurant for 8:00pm\"),\n", - " AIMessage(content=\"Sure, let me find the best options for you.\", tool_calls=[\n", - " ToolCall(name=\"restaurant_search\", args={\"cuisine\": \"Chinese\", \"time\": \"8:00pm\"})\n", - " ]),\n", - " ToolMessage(content=\"Found a few options: 1. Golden Dragon, 2. Jade Palace\"),\n", - " AIMessage(content=\"I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?\"),\n", - " HumanMessage(content=\"Let's go with Golden Dragon.\"),\n", - " AIMessage(content=\"Great choice! I'll book a table for 8:00pm at Golden Dragon.\", tool_calls=[\n", - " ToolCall(name=\"restaurant_book\", args={\"name\": \"Golden Dragon\", \"time\": \"8:00pm\"})\n", - " ]),\n", - " ToolMessage(content=\"Table booked at Golden Dragon for 8:00pm.\"),\n", - " AIMessage(content=\"Your table at Golden Dragon is booked for 8:00pm. Enjoy your meal!\"),\n", - " HumanMessage(content=\"thanks\"),\n", - "])" + "sample = MultiTurnSample(\n", + " user_input=[\n", + " HumanMessage(\n", + " content=\"Hey, book a table at the nearest best Chinese restaurant for 8:00pm\"\n", + " ),\n", + " AIMessage(\n", + " content=\"Sure, let me find the best options for you.\",\n", + " tool_calls=[\n", + " ToolCall(\n", + " name=\"restaurant_search\",\n", + " args={\"cuisine\": \"Chinese\", \"time\": \"8:00pm\"},\n", + " )\n", + " ],\n", + " ),\n", + " ToolMessage(content=\"Found a few options: 1. Golden Dragon, 2. Jade Palace\"),\n", + " AIMessage(\n", + " content=\"I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?\"\n", + " ),\n", + " HumanMessage(content=\"Let's go with Golden Dragon.\"),\n", + " AIMessage(\n", + " content=\"Great choice! I'll book a table for 8:00pm at Golden Dragon.\",\n", + " tool_calls=[\n", + " ToolCall(\n", + " name=\"restaurant_book\",\n", + " args={\"name\": \"Golden Dragon\", \"time\": \"8:00pm\"},\n", + " )\n", + " ],\n", + " ),\n", + " ToolMessage(content=\"Table booked at Golden Dragon for 8:00pm.\"),\n", + " AIMessage(\n", + " content=\"Your table at Golden Dragon is booked for 8:00pm. 
Enjoy your meal!\"\n", + " ),\n", + " HumanMessage(content=\"thanks\"),\n", + " ]\n", + ")" ] }, { diff --git a/docs/howtos/customizations/run_config.ipynb b/docs/howtos/customizations/run_config.ipynb index 4403c5f1a..954c3d57d 100644 --- a/docs/howtos/customizations/run_config.ipynb +++ b/docs/howtos/customizations/run_config.ipynb @@ -29,13 +29,11 @@ "metadata": {}, "outputs": [], "source": [ - "\n", - "\n", "from ragas.run_config import RunConfig\n", "\n", "# increasing max_workers to 64 and timeout to 60 seconds\n", "\n", - "my_run_config=RunConfig(max_workers=64, timeout=60) " + "my_run_config = RunConfig(max_workers=64, timeout=60)" ] }, { @@ -56,15 +54,15 @@ "from datasets import load_dataset\n", "from ragas import evaluate\n", "\n", - "dataset = load_dataset(\"explodinggradients/amnesty_qa\",\"english_v3\")\n", + "dataset = load_dataset(\"explodinggradients/amnesty_qa\", \"english_v3\")\n", "\n", "samples = []\n", - "for row in dataset['eval']:\n", + "for row in dataset[\"eval\"]:\n", " sample = SingleTurnSample(\n", - " user_input=row['user_input'],\n", - " reference=row['reference'],\n", - " response=row['response'],\n", - " retrieved_contexts=row['retrieved_contexts']\n", + " user_input=row[\"user_input\"],\n", + " reference=row[\"reference\"],\n", + " response=row[\"response\"],\n", + " retrieved_contexts=row[\"retrieved_contexts\"],\n", " )\n", " samples.append(sample)\n", "\n", diff --git a/pyproject.toml b/pyproject.toml index 98af97220..bf8093a7b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,5 +59,7 @@ build-backend = "setuptools.build_meta" write_to = "src/ragas/_version.py" [tool.pytest.ini_options] -addopts = "-n 4" -asyncio_default_fixture_loop_scope = "function" \ No newline at end of file +addopts = "-n 0" +asyncio_default_fixture_loop_scope = "function" +[pytest] +testpaths = ["tests"] diff --git a/src/ragas/exceptions.py b/src/ragas/exceptions.py index bbe059c7b..b4e29751f 100644 --- a/src/ragas/exceptions.py +++ b/src/ragas/exceptions.py @@ -19,3 +19,15 @@ class ExceptionInRunner(RagasException): def __init__(self): msg = "The runner thread which was running the jobs raised an exeception. Read the traceback above to debug it. You can also pass `raise_exceptions=False` incase you want to show only a warning message instead." super().__init__(msg) + + +class RagasOutputParserException(RagasException): + """ + Exception raised when the output parser fails to parse the output. + """ + + def __init__(self, num_retries: int): + msg = ( + f"The output parser failed to parse the output after {num_retries} retries." 
+ ) + super().__init__(msg) diff --git a/src/ragas/integrations/langchain.py b/src/ragas/integrations/langchain.py index 7d1bbd0de..36821cdfb 100644 --- a/src/ragas/integrations/langchain.py +++ b/src/ragas/integrations/langchain.py @@ -48,9 +48,9 @@ def __init__(self, metric: Metric, **kwargs: t.Any): t.cast(MetricWithLLM, self.metric).llm = LangchainLLMWrapper(llm) if isinstance(self.metric, MetricWithEmbeddings): embeddings = get_or_init(kwargs, "embeddings", OpenAIEmbeddings) - t.cast( - MetricWithEmbeddings, self.metric - ).embeddings = LangchainEmbeddingsWrapper(embeddings) + t.cast(MetricWithEmbeddings, self.metric).embeddings = ( + LangchainEmbeddingsWrapper(embeddings) + ) self.metric.init(run_config) assert isinstance( diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py index f0b2b3daa..5df59ffb0 100644 --- a/src/ragas/llms/base.py +++ b/src/ragas/llms/base.py @@ -47,6 +47,7 @@ def is_multiple_completion_supported(llm: BaseLanguageModel) -> bool: @dataclass class BaseRagasLLM(ABC): run_config: RunConfig = field(default_factory=RunConfig) + multiple_completion_supported: bool = False def set_run_config(self, run_config: RunConfig): self.run_config = run_config diff --git a/src/ragas/llms/output_parser.py b/src/ragas/llms/output_parser.py index 254b96cc2..29b6a7ca7 100644 --- a/src/ragas/llms/output_parser.py +++ b/src/ragas/llms/output_parser.py @@ -53,7 +53,7 @@ def get_json_format_instructions(pydantic_object: t.Type[TBaseModel]) -> str: return resp -class RagasoutputParser(PydanticOutputParser): +class RagasOutputParserOld(PydanticOutputParser): async def aparse( # type: ignore self, result: str, prompt: PromptValue, llm: BaseRagasLLM, max_retries: int = 1 ): diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index 93cce2014..d0c599d66 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -8,7 +8,7 @@ from pydantic import BaseModel from ragas.dataset_schema import SingleTurnSample -from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions +from ragas.llms.output_parser import RagasOutputParserOld, get_json_format_instructions from ragas.llms.prompt import Prompt, PromptValue from ragas.metrics._answer_similarity import AnswerSimilarity from ragas.metrics._faithfulness import ( @@ -39,7 +39,7 @@ class AnswerCorrectnessClassification(BaseModel): _output_instructions = get_json_format_instructions(AnswerCorrectnessClassification) -_output_parser = RagasoutputParser(pydantic_object=AnswerCorrectnessClassification) +_output_parser = RagasOutputParserOld(pydantic_object=AnswerCorrectnessClassification) CORRECTNESS_INSTRUCTIONS = """\ Given a ground truth and an answer statements, analyze each statement and classify them in one of the following categories: diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py index 74e2e0c74..50d787357 100644 --- a/src/ragas/metrics/_answer_relevance.py +++ b/src/ragas/metrics/_answer_relevance.py @@ -22,7 +22,6 @@ from langchain_core.callbacks import Callbacks - class ResponseRelevanceOutput(BaseModel): question: str noncommittal: int @@ -137,7 +136,9 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: responses = [] for _ in range(self.strictness): response = await self.question_generation.generate( - data=prompt_input, llm=self.llm, callbacks=callbacks, + data=prompt_input, + llm=self.llm, + callbacks=callbacks, ) responses.append(response) diff --git 
a/src/ragas/metrics/_context_entities_recall.py b/src/ragas/metrics/_context_entities_recall.py index a1cf78943..10d3725bd 100644 --- a/src/ragas/metrics/_context_entities_recall.py +++ b/src/ragas/metrics/_context_entities_recall.py @@ -9,7 +9,7 @@ from langchain.pydantic_v1 import BaseModel from ragas.dataset_schema import SingleTurnSample -from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions +from ragas.llms.output_parser import RagasOutputParserOld, get_json_format_instructions from ragas.llms.prompt import Prompt from ragas.metrics.base import MetricType, MetricWithLLM, SingleTurnMetric @@ -26,7 +26,7 @@ class ContextEntitiesResponse(BaseModel): _output_instructions = get_json_format_instructions( pydantic_object=ContextEntitiesResponse # type: ignore ) -_output_parser = RagasoutputParser(pydantic_object=ContextEntitiesResponse) +_output_parser = RagasOutputParserOld(pydantic_object=ContextEntitiesResponse) TEXT_ENTITY_EXTRACTION = Prompt( diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py index 4262c2cf0..32ba34cd6 100644 --- a/src/ragas/metrics/_context_precision.py +++ b/src/ragas/metrics/_context_precision.py @@ -8,7 +8,7 @@ from langchain.pydantic_v1 import BaseModel, Field from ragas.dataset_schema import SingleTurnSample -from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions +from ragas.llms.output_parser import RagasOutputParserOld, get_json_format_instructions from ragas.llms.prompt import Prompt, PromptValue from ragas.metrics._string import NonLLMStringSimilarity from ragas.metrics.base import MetricType, MetricWithLLM, SingleTurnMetric, ensembler @@ -35,7 +35,7 @@ class ContextPrecisionVerifications(BaseModel): _verification_output_instructions = get_json_format_instructions( ContextPrecisionVerification # type: ignore ) -_output_parser = RagasoutputParser(pydantic_object=ContextPrecisionVerification) +_output_parser = RagasOutputParserOld(pydantic_object=ContextPrecisionVerification) CONTEXT_PRECISION = Prompt( name="context_precision", diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py index b2f95f133..724b406f8 100644 --- a/src/ragas/metrics/_context_recall.py +++ b/src/ragas/metrics/_context_recall.py @@ -8,7 +8,7 @@ from pydantic import BaseModel, RootModel from ragas.dataset_schema import SingleTurnSample -from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions +from ragas.llms.output_parser import RagasOutputParserOld, get_json_format_instructions from ragas.llms.prompt import Prompt from ragas.metrics._string import NonLLMStringSimilarity from ragas.metrics.base import MetricType, MetricWithLLM, SingleTurnMetric, ensembler @@ -39,7 +39,9 @@ def dicts(self) -> t.List[t.Dict]: _classification_output_instructions = get_json_format_instructions( ContextRecallClassificationAnswers ) -_output_parser = RagasoutputParser(pydantic_object=ContextRecallClassificationAnswers) +_output_parser = RagasOutputParserOld( + pydantic_object=ContextRecallClassificationAnswers +) CONTEXT_RECALL_RA = Prompt( diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index 2a756c229..1d4b7ba1e 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -9,7 +9,7 @@ from pydantic import BaseModel, Field, RootModel from ragas.dataset_schema import SingleTurnSample -from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions +from 
ragas.llms.output_parser import RagasOutputParserOld, get_json_format_instructions from ragas.llms.prompt import Prompt from ragas.metrics.base import ( MetricType, @@ -44,7 +44,7 @@ class StatementsAnswers(RootModel): _statements_output_instructions = get_json_format_instructions(StatementsAnswers) -_statements_output_parser = RagasoutputParser(pydantic_object=StatementsAnswers) +_statements_output_parser = RagasOutputParserOld(pydantic_object=StatementsAnswers) LONG_FORM_ANSWER_PROMPT = Prompt( @@ -101,7 +101,7 @@ def dicts(self): _faithfulness_output_instructions = get_json_format_instructions( StatementFaithfulnessAnswers ) -_faithfulness_output_parser = RagasoutputParser( +_faithfulness_output_parser = RagasOutputParserOld( pydantic_object=StatementFaithfulnessAnswers ) diff --git a/src/ragas/metrics/_summarization.py b/src/ragas/metrics/_summarization.py index 74a5a9175..90b4db3ed 100644 --- a/src/ragas/metrics/_summarization.py +++ b/src/ragas/metrics/_summarization.py @@ -8,7 +8,7 @@ from langchain.pydantic_v1 import BaseModel from ragas.dataset_schema import SingleTurnSample -from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions +from ragas.llms.output_parser import RagasOutputParserOld, get_json_format_instructions from ragas.llms.prompt import Prompt from ragas.metrics.base import MetricType, MetricWithLLM, SingleTurnMetric @@ -39,13 +39,13 @@ class GenerateAnswersResponse(BaseModel): _output_instructions_keyphrase_extraction = get_json_format_instructions( pydantic_object=ExtractKeyphrasesResponse # type: ignore ) -_output_parser_question_generation = RagasoutputParser( +_output_parser_question_generation = RagasOutputParserOld( pydantic_object=GenerateQuestionsResponse ) -_output_parser_answer_generation = RagasoutputParser( +_output_parser_answer_generation = RagasOutputParserOld( pydantic_object=GenerateAnswersResponse ) -_output_parser_keyphrase_extraction = RagasoutputParser( +_output_parser_keyphrase_extraction = RagasOutputParserOld( pydantic_object=ExtractKeyphrasesResponse ) diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index b8588b839..81e459b79 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -15,28 +15,23 @@ from dataclasses import dataclass, field from enum import Enum +from pysbd import Segmenter + from ragas.callbacks import new_group from ragas.dataset_schema import MultiTurnSample, SingleTurnSample from ragas.executor import is_event_loop_running +from ragas.prompt import PromptMixin from ragas.run_config import RunConfig -from ragas.utils import deprecated +from ragas.utils import RAGAS_SUPPORTED_LANGUAGE_CODES, deprecated if t.TYPE_CHECKING: from langchain_core.callbacks import Callbacks from ragas.embeddings import BaseRagasEmbeddings from ragas.llms import BaseRagasLLM - -from pysbd import Segmenter -from pysbd.languages import LANGUAGE_CODES - -from ragas.prompt import PromptMixin - logger = logging.getLogger(__name__) -LANGUAGE_CODES = {v.__name__.lower(): k for k, v in LANGUAGE_CODES.items()} - VALID_COLUMNS = [ "user_input", "retrieved_contexts", @@ -58,8 +53,7 @@ class Metric(ABC): @property @abstractmethod - def name(self) -> str: - ... + def name(self) -> str: ... @property def required_columns(self) -> t.Dict[str, t.Set[str]]: @@ -146,8 +140,7 @@ async def ascore( return score @abstractmethod - async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: - ... + async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: ... 
@dataclass @@ -245,8 +238,7 @@ async def _single_turn_ascore( self, sample: SingleTurnSample, callbacks: Callbacks, - ) -> float: - ... + ) -> float: ... class MultiTurnMetric(Metric): @@ -311,8 +303,7 @@ async def _multi_turn_ascore( self, sample: MultiTurnSample, callbacks: Callbacks, - ) -> float: - ... + ) -> float: ... class Ensember: @@ -358,12 +349,14 @@ def get_segmenter( Get a sentence segmenter for a given language """ language = language.lower() - if language not in LANGUAGE_CODES: + if language not in RAGAS_SUPPORTED_LANGUAGE_CODES: raise ValueError( - f"Language '{language}' not supported. Supported languages: {LANGUAGE_CODES.keys()}" + f"Language '{language}' not supported. Supported languages: {RAGAS_SUPPORTED_LANGUAGE_CODES.keys()}" ) return Segmenter( - language=LANGUAGE_CODES[language], clean=clean, char_span=char_span + language=RAGAS_SUPPORTED_LANGUAGE_CODES[language], + clean=clean, + char_span=char_span, ) diff --git a/src/ragas/prompt.py b/src/ragas/prompt.py deleted file mode 100644 index 6c3ddd917..000000000 --- a/src/ragas/prompt.py +++ /dev/null @@ -1,213 +0,0 @@ -from __future__ import annotations - -import inspect -import typing as t -from abc import ABC, abstractmethod - -import pydantic - -# Check Pydantic version -from pydantic import BaseModel - -from ragas.llms.output_parser import RagasoutputParser -from ragas.llms.prompt import PromptValue - -if t.TYPE_CHECKING: - from langchain_core.callbacks import Callbacks - - from ragas.llms.base import BaseRagasLLM - -PYDANTIC_V2 = pydantic.VERSION.startswith("2.") - - -class BasePrompt(ABC): - @abstractmethod - async def generate( - self, - llm: BaseRagasLLM, - data: t.Any, - n: int = 1, - temperature: t.Optional[float] = None, - stop: t.Optional[t.List[str]] = None, - callbacks: Callbacks = [], - ) -> t.Any: - pass - - -def model_to_dict( - model: BaseModel, - by_alias: bool = False, - exclude_unset: bool = False, - exclude_defaults: bool = False, -) -> t.Dict[str, t.Any]: - if PYDANTIC_V2: - return model.model_dump( # type: ignore - by_alias=by_alias, - exclude_unset=exclude_unset, - exclude_defaults=exclude_defaults, - ) - else: - return model.dict( - by_alias=by_alias, - exclude_unset=exclude_unset, - exclude_defaults=exclude_defaults, - ) - - -def to_json(model: t.Any, indent: int = 4) -> str: - if PYDANTIC_V2: - # Pydantic 2.x - return model.model_dump_json(indent=indent) - else: - # Pydantic 1.x - return model.json(indent=indent) - - -def model_to_json_schema(model: t.Type[BaseModel]) -> str: - if PYDANTIC_V2: - # NOTE: this is not the same as model.schema_json() - return model.model_json_schema() # type: ignore - else: - return model.schema_json() - - -InputModel = t.TypeVar("InputModel", bound=BaseModel) -OutputModel = t.TypeVar("OutputModel", bound=BaseModel) - - -class StringIO(BaseModel): - text: str - - -class BoolIO(BaseModel): - value: bool - - -class PydanticPrompt(BasePrompt, t.Generic[InputModel, OutputModel]): - input_model: t.Type[InputModel] - output_model: t.Type[OutputModel] - instruction: str - examples: t.List[t.Tuple[InputModel, OutputModel]] = [] - - def generate_instruction(self) -> str: - return self.instruction - - def generate_output_signature(self, indent: int = 4) -> str: - schema = model_to_json_schema(self.output_model) - return ( - f"Please return the output in a JSON format that complies with the " - f"following schema as specified in JSON Schema and OpenAPI specification:\n" - f"{schema}" - ) - - def generate_examples(self): - if self.examples: - example_strings = [] - for e 
in self.examples: - input_data, output_data = e - example_strings.append( - self.instruction - + "\n" - + "input: " - + to_json(input_data, indent=4) - + "\n" - + "output: " - + to_json(output_data, indent=4) - ) - - return ( - "These are some examples to show how to perform the above instruction\n" - + "\n\n".join(example_strings) - ) - # if no examples are provided - else: - return "" - - def to_string(self, data: InputModel) -> str: - # this needs a check - return ( - self.generate_instruction() - + "\n" - + self.generate_output_signature() - + "\n" - + self.generate_examples() - + "\nNow perform the above instruction with the following input\n" - + "input: " - + to_json(data, indent=4) - + "\n" - + "output: " - ) - - async def generate( - self, - llm: BaseRagasLLM, - data: InputModel, - n: int = 1, - temperature: t.Optional[float] = None, - stop: t.Optional[t.List[str]] = None, - callbacks: Callbacks = [], - ) -> OutputModel: - processed_data = self.process_input(data) - prompt_value = PromptValue(prompt_str=self.to_string(processed_data)) - resp = await llm.generate( - prompt_value, - n=n, - temperature=temperature, - stop=stop, - callbacks=callbacks, - ) - resp_text = resp.generations[0][0].text - parser = RagasoutputParser(pydantic_object=self.output_model) - answer = await parser.aparse(resp_text, prompt_value, llm, max_retries=3) - - # TODO: make sure RagasOutputPraser returns the same type as OutputModel - return self.process_output(answer, data) # type: ignore - - def process_input(self, input: InputModel) -> InputModel: - return input - - def process_output(self, output: OutputModel, input: InputModel) -> OutputModel: - return output - - -class StringPrompt(BasePrompt): - async def generate( - self, - llm: BaseRagasLLM, - data: str, - n: int = 1, - temperature: t.Optional[float] = None, - stop: t.Optional[t.List[str]] = None, - callbacks: Callbacks = [], - ) -> str: - prompt_value = PromptValue(prompt_str=data) - llm_result = await llm.agenerate_text( - prompt_value, - n=n, - temperature=temperature, - stop=stop, - callbacks=callbacks, - ) - return llm_result.generations[0][0].text - - -class PromptMixin: - def get_prompts(self) -> t.Dict[str, PydanticPrompt]: - prompts = {} - for name, value in inspect.getmembers(self): - if isinstance(value, PydanticPrompt): - prompts.update({name: value}) - return prompts - - def set_prompts(self, **prompts): - available_prompts = self.get_prompts() - for key, value in prompts.items(): - if key not in available_prompts: - raise ValueError( - f"Prompt with name '{key}' does not exist. Use get_prompts() to see available prompts." 
- ) - if not isinstance(value, PydanticPrompt): - raise ValueError( - f"Prompt with name '{key}' must be an instance of 'ragas.prompt.PydanticPrompt'" - ) - setattr(self, key, value) diff --git a/src/ragas/prompt/__init__.py b/src/ragas/prompt/__init__.py new file mode 100644 index 000000000..0dfebf6d0 --- /dev/null +++ b/src/ragas/prompt/__init__.py @@ -0,0 +1,11 @@ +from .base import BasePrompt, BoolIO, PydanticPrompt, StringIO, StringPrompt +from .mixin import PromptMixin + +__all__ = [ + "BasePrompt", + "BoolIO", + "PydanticPrompt", + "StringIO", + "StringPrompt", + "PromptMixin", +] diff --git a/src/ragas/prompt/base.py b/src/ragas/prompt/base.py new file mode 100644 index 000000000..674acbf70 --- /dev/null +++ b/src/ragas/prompt/base.py @@ -0,0 +1,384 @@ +from __future__ import annotations + +import logging +import typing as t +from abc import ABC, abstractmethod + +from langchain_core.exceptions import OutputParserException +from langchain_core.output_parsers import PydanticOutputParser +from pydantic import BaseModel + +from ragas.callbacks import new_group +from ragas.exceptions import RagasOutputParserException +from ragas.llms.prompt import PromptValue +from ragas.utils import RAGAS_SUPPORTED_LANGUAGE_CODES, camel_to_snake + +if t.TYPE_CHECKING: + from langchain_core.callbacks import Callbacks + + from ragas.llms.base import BaseRagasLLM + +logger = logging.getLogger(__name__) + + +class BasePrompt(ABC): + def __init__(self, name: t.Optional[str] = None, language: str = "english"): + if name is None: + self.name = camel_to_snake(self.__class__.__name__) + + if language not in RAGAS_SUPPORTED_LANGUAGE_CODES: + raise ValueError( + f"Language '{language}' not supported. Supported languages: {RAGAS_SUPPORTED_LANGUAGE_CODES.keys()}" + ) + self.language = language + + @abstractmethod + async def generate( + self, + llm: BaseRagasLLM, + data: t.Any, + temperature: t.Optional[float] = None, + stop: t.Optional[t.List[str]] = None, + callbacks: Callbacks = [], + ) -> t.Any: + """ + Generate a single completion from the prompt. + """ + pass + + @abstractmethod + def generate_multiple( + self, + llm: BaseRagasLLM, + data: t.Any, + n: int = 1, + temperature: t.Optional[float] = None, + stop: t.Optional[t.List[str]] = None, + callbacks: Callbacks = [], + ) -> t.Any: + """ + Generate multiple completions from the prompt. 
+ """ + pass + + +InputModel = t.TypeVar("InputModel", bound=BaseModel) +OutputModel = t.TypeVar("OutputModel", bound=BaseModel) + + +class StringIO(BaseModel): + text: str + + +class BoolIO(BaseModel): + value: bool + + +class PydanticPrompt(BasePrompt, t.Generic[InputModel, OutputModel]): + input_model: t.Type[InputModel] + output_model: t.Type[OutputModel] + instruction: str + examples: t.List[t.Tuple[InputModel, OutputModel]] = [] + + def _generate_instruction(self) -> str: + return self.instruction + + def _generate_output_signature(self, indent: int = 4) -> str: + return ( + f"Please return the output in a JSON format that complies with the " + f"following schema as specified in JSON Schema and OpenAPI specification:\n" + f"{self.output_model.model_json_schema()}" + ) + + def _generate_examples(self): + if self.examples: + example_strings = [] + for e in self.examples: + input_data, output_data = e + example_strings.append( + self.instruction + + "\n" + + "input: " + + input_data.model_dump_json(indent=4) + + "\n" + + "output: " + + output_data.model_dump_json(indent=4) + ) + + return ( + "These are some examples to show how to perform the above instruction\n" + + "\n\n".join(example_strings) + ) + # if no examples are provided + else: + return "" + + def to_string(self, data: InputModel) -> str: + # this needs a check + return ( + self._generate_instruction() + + "\n" + + self._generate_output_signature() + + "\n" + + self._generate_examples() + + "\nNow perform the above instruction with the following input\n" + + "input: " + + data.model_dump_json(indent=4) + + "\n" + + "output: " + ) + + async def generate( + self, + llm: BaseRagasLLM, + data: InputModel, + temperature: t.Optional[float] = None, + stop: t.Optional[t.List[str]] = None, + callbacks: t.Optional[Callbacks] = None, + ) -> OutputModel: + """ + Generate a single output using the provided language model and input data. + + This method is a special case of `generate_multiple` where only one output is generated. + + Parameters + ---------- + llm : BaseRagasLLM + The language model to use for generation. + data : InputModel + The input data for generation. + temperature : float, optional + The temperature parameter for controlling randomness in generation. + stop : List[str], optional + A list of stop sequences to end generation. + callbacks : Callbacks, optional + Callback functions to be called during the generation process. + + Returns + ------- + OutputModel + The generated output. + + Notes + ----- + This method internally calls `generate_multiple` with `n=1` and returns the first (and only) result. + """ + callbacks = callbacks or [] + + # this is just a special case of generate_multiple + output_single = await self.generate_multiple( + llm=llm, + data=data, + n=1, + temperature=temperature, + stop=stop, + callbacks=callbacks, + ) + return output_single[0] + + async def generate_multiple( + self, + llm: BaseRagasLLM, + data: InputModel, + n: int = 1, + temperature: t.Optional[float] = None, + stop: t.Optional[t.List[str]] = None, + callbacks: t.Optional[Callbacks] = None, + ) -> t.List[OutputModel]: + """ + Generate multiple outputs using the provided language model and input data. + + Parameters + ---------- + llm : BaseRagasLLM + The language model to use for generation. + data : InputModel + The input data for generation. + n : int, optional + The number of outputs to generate. Default is 1. + temperature : float, optional + The temperature parameter for controlling randomness in generation. 
+ stop : List[str], optional + A list of stop sequences to end generation. + callbacks : Callbacks, optional + Callback functions to be called during the generation process. + + Returns + ------- + List[OutputModel] + A list of generated outputs. + + Raises + ------ + RagasOutputParserException + If there's an error parsing the output. + """ + callbacks = callbacks or [] + processed_data = self.process_input(data) + prompt_rm, prompt_cb = new_group( + name=self.name, + inputs={"data": processed_data}, + callbacks=callbacks, + ) + prompt_value = PromptValue(prompt_str=self.to_string(processed_data)) + resp = await llm.generate( + prompt_value, + n=n, + temperature=temperature, + stop=stop, + callbacks=prompt_cb, + ) + + output_models = [] + parser = RagasOutputParser(pydantic_object=self.output_model) + for i in range(n): + output_string = resp.generations[0][i].text + try: + answer = await parser.parse_output_string( + output_string=output_string, + prompt_value=prompt_value, + llm=llm, + callbacks=prompt_cb, + max_retries=3, + ) + processed_output = self.process_output(answer, data) # type: ignore + output_models.append(processed_output) + except RagasOutputParserException as e: + prompt_rm.on_chain_error(error=e) + logger.error("Prompt %s failed to parse output: %s", self.name, e) + raise e + + prompt_rm.on_chain_end({"output": output_models}) + return output_models + + def process_input(self, input: InputModel) -> InputModel: + return input + + def process_output(self, output: OutputModel, input: InputModel) -> OutputModel: + return output + + +class StringPrompt(BasePrompt): + """ + A simple prompt that can be formatted with additional data using f-string syntax. + + This prompt is a simpler alternative to PydanticPrompt for those who prefer a more + flexible approach without the need for a Pydantic model. + + Parameters + ---------- + instruction : str + The instruction string that can be formatted with additional data. + + Examples + -------- + >>> from ragas.prompt import string_prompt + >>> await prompt.generate(llm=llm, data={"category": "commerce"}) + """ + + async def generate( + self, + llm: BaseRagasLLM, + data: str, + temperature: t.Optional[float] = None, + stop: t.Optional[t.List[str]] = None, + callbacks: Callbacks = [], + ) -> str: + """ + Generate text based on the instruction and provided data. + + Parameters + ---------- + llm : BaseRagasLLM + The language model to use for text generation. + data : Optional[Dict[str, Any]], optional + The data to format the instruction with, by default None. + n : int, optional + The number of completions to generate, by default 1. + temperature : Optional[float], optional + The temperature for text generation, by default None. + stop : Optional[List[str]], optional + The stop sequences for text generation, by default None. + callbacks : Callbacks, optional + The callbacks to use during text generation, by default []. + + Returns + ------- + str + The generated text. 
+ """ + llm_result = await llm.agenerate_text( + PromptValue(prompt_str=data), + n=1, + temperature=temperature, + stop=stop, + callbacks=callbacks, + ) + return llm_result.generations[0][0].text + + async def generate_multiple( + self, + llm: BaseRagasLLM, + data: str, + n: int = 1, + temperature: t.Optional[float] = None, + stop: t.Optional[t.List[str]] = None, + callbacks: Callbacks = [], + ) -> t.List[str]: + return [ + await self.generate(llm, data, temperature, stop, callbacks) + for _ in range(n) + ] + + +class OutputStringAndPrompt(BaseModel): + output_string: str + prompt_value: str + + +class FixOutputFormat(PydanticPrompt[OutputStringAndPrompt, StringIO]): + instruction = "The output string did not satisfy the constraints given in the prompt. Fix the output string and return it." + input_model = OutputStringAndPrompt + output_model = StringIO + + +fix_output_format_prompt = FixOutputFormat() + + +class RagasOutputParser(PydanticOutputParser[OutputModel]): + async def parse_output_string( + self, + output_string: str, + prompt_value: PromptValue, + llm: BaseRagasLLM, + callbacks: Callbacks, + max_retries: int = 1, + ): + callbacks = callbacks or [] + try: + result = super().parse(output_string) + except OutputParserException: + if max_retries != 0: + retry_rm, retry_cb = new_group( + name="fix_output_format", + inputs={"output_string": output_string}, + callbacks=callbacks, + ) + fixed_output_string = await fix_output_format_prompt.generate( + llm=llm, + data=OutputStringAndPrompt( + output_string=output_string, + prompt_value=prompt_value.to_string(), + ), + ) + retry_rm.on_chain_end({"fixed_output_string": fixed_output_string}) + return await self.parse_output_string( + output_string=fixed_output_string.text, + prompt_value=prompt_value, + llm=llm, + max_retries=max_retries - 1, + callbacks=callbacks, + ) + else: + raise RagasOutputParserException(num_retries=max_retries) + return result diff --git a/src/ragas/prompt/mixin.py b/src/ragas/prompt/mixin.py new file mode 100644 index 000000000..c577fbbe9 --- /dev/null +++ b/src/ragas/prompt/mixin.py @@ -0,0 +1,26 @@ +import inspect +import typing as t + +from .base import PydanticPrompt + + +class PromptMixin: + def get_prompts(self) -> t.Dict[str, PydanticPrompt]: + prompts = {} + for name, value in inspect.getmembers(self): + if isinstance(value, PydanticPrompt): + prompts.update({name: value}) + return prompts + + def set_prompts(self, **prompts): + available_prompts = self.get_prompts() + for key, value in prompts.items(): + if key not in available_prompts: + raise ValueError( + f"Prompt with name '{key}' does not exist. Use get_prompts() to see available prompts." 
+ ) + if not isinstance(value, PydanticPrompt): + raise ValueError( + f"Prompt with name '{key}' must be an instance of 'ragas.prompt.PydanticPrompt'" + ) + setattr(self, key, value) diff --git a/src/ragas/testset/transforms/extractors/llm_based.py b/src/ragas/testset/transforms/extractors/llm_based.py index 09d73a4a8..39b9bf0a4 100644 --- a/src/ragas/testset/transforms/extractors/llm_based.py +++ b/src/ragas/testset/transforms/extractors/llm_based.py @@ -108,7 +108,6 @@ class HeadlinesExtractorPrompt(PydanticPrompt[StringIO, Headlines]): ] - class NamedEntities(BaseModel): ORG: t.List[str] LOC: t.List[str] @@ -141,7 +140,6 @@ class NERPrompt(PydanticPrompt[StringIO, NEROutput]): ] - @dataclass class SummaryExtractor(LLMBasedExtractor): property_name: str = "summary" @@ -207,4 +205,3 @@ async def extract(self, node: Node) -> t.Tuple[str, t.Dict[str, t.List[str]]]: return self.property_name, {} result = await self.prompt.generate(self.llm, data=StringIO(text=node_text)) return self.property_name, result.entities.model_dump() - diff --git a/src/ragas/utils.py b/src/ragas/utils.py index bca9da871..92a3bc73e 100644 --- a/src/ragas/utils.py +++ b/src/ragas/utils.py @@ -2,17 +2,22 @@ import logging import os +import re import typing as t import warnings from functools import lru_cache import numpy as np from datasets import Dataset +from pysbd.languages import LANGUAGE_CODES if t.TYPE_CHECKING: from ragas.metrics.base import Metric DEBUG_ENV_VAR = "RAGAS_DEBUG" +RAGAS_SUPPORTED_LANGUAGE_CODES = { + v.__name__.lower(): k for k, v in LANGUAGE_CODES.items() +} @lru_cache(maxsize=1) @@ -207,3 +212,12 @@ def convert_v1_to_v2_dataset(dataset: Dataset) -> Dataset: def convert_v2_to_v1_dataset(dataset: Dataset) -> Dataset: columns_map = {k: v for k, v in REQUIRED_COLS_v1.items() if k in dataset.features} return dataset.rename_columns(columns_map) + + +def camel_to_snake(name): + """ + Convert a camelCase string to snake_case. + eg: HaiThere -> hai_there + """ + pattern = re.compile(r"(? hello\nworld -> world" + + +def test_prompt_hash(): + from ragas.prompt import StringPrompt + + class Prompt(StringPrompt): + instruction = "You are a helpful assistant." + + p = Prompt() + assert hash(p) == hash(p) + p.instruction = "You are a helpful assistant. And some more" + # assert hash(p) != hash(p) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index c14554ea6..71e1803b4 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -37,3 +37,17 @@ def test_check_if_sum_is_close(values, close_to, num_places): ) def test_get_from_dict(data_dict, key, expected): assert get_from_dict(data_dict, key) == expected + + +@pytest.mark.parametrize( + ["camel_case_string", "expected"], + [ + ("myVariableName", "my_variable_name"), + ("CamelCaseString", "camel_case_string"), + ("AnotherCamelCaseString", "another_camel_case_string"), + ], +) +def test_camel_to_snake(camel_case_string, expected): + from ragas.utils import camel_to_snake + + assert camel_to_snake(camel_case_string) == expected
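
The new ragas.prompt package added above (BasePrompt, PydanticPrompt, StringPrompt, PromptMixin) replaces the old single-module src/ragas/prompt.py. The following is a minimal sketch of the public surface this patch introduces: PydanticPrompt.generate, generate_multiple, and the new RagasOutputParserException. It is an illustration rather than part of the patch; EchoInput, EchoOutput, QuestionDetector, and the llm argument are hypothetical stand-ins, and any configured BaseRagasLLM wrapper (for example LangchainLLMWrapper around a LangChain chat model) is assumed to be usable here.

from pydantic import BaseModel, Field

from ragas.exceptions import RagasOutputParserException
from ragas.prompt import PydanticPrompt


class EchoInput(BaseModel):
    text: str = Field(description="text to classify")


class EchoOutput(BaseModel):
    is_question: bool = Field(description="whether the text is a question")


class QuestionDetector(PydanticPrompt[EchoInput, EchoOutput]):
    # the default prompt name is derived via camel_to_snake -> "question_detector"
    instruction = "Decide whether the input text is a question."
    input_model = EchoInput
    output_model = EchoOutput
    examples = [
        (EchoInput(text="What time is it?"), EchoOutput(is_question=True)),
        (EchoInput(text="It is noon."), EchoOutput(is_question=False)),
    ]


async def detect(llm, text: str):
    # llm is assumed to be a configured BaseRagasLLM (hypothetical in this sketch)
    prompt = QuestionDetector()
    try:
        # generate() is just the n=1 special case of generate_multiple()
        one = await prompt.generate(llm=llm, data=EchoInput(text=text))
        many = await prompt.generate_multiple(llm=llm, data=EchoInput(text=text), n=3)
        return one.is_question, [o.is_question for o in many]
    except RagasOutputParserException:
        # raised after the parser exhausts its fix-output-format retries
        return None, []

Running it comes down to asyncio.run(detect(llm, "Is it raining?")) once a real LLM wrapper has been constructed.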
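
StringPrompt in the new ragas/prompt/base.py is the lighter-weight option: per its generate() signature, data is the complete prompt string and the return value is the raw completion text. Below is a minimal sketch under the same assumption of an already configured BaseRagasLLM; the summarize helper is hypothetical.

from ragas.prompt import StringPrompt


async def summarize(llm, passage: str) -> str:
    prompt = StringPrompt()
    # data is sent to the LLM as-is; generate_multiple(n=...) repeats the same
    # call n times and returns a list of strings
    return await prompt.generate(llm=llm, data=f"Summarize in one sentence:\n{passage}")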
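
Two smaller relocations are also easy to exercise directly: camel_to_snake and the RAGAS_SUPPORTED_LANGUAGE_CODES mapping now live in ragas.utils, and get_segmenter in ragas.metrics.base validates languages against that mapping. The expected values in the comments come from the new unit test and docstring; clean and char_span are passed explicitly here to avoid assuming their defaults.

from ragas.metrics.base import get_segmenter
from ragas.utils import RAGAS_SUPPORTED_LANGUAGE_CODES, camel_to_snake

print(camel_to_snake("CamelCaseString"))            # camel_case_string
print("english" in RAGAS_SUPPORTED_LANGUAGE_CODES)  # True; keys are lower-cased pysbd language names

segmenter = get_segmenter(language="english", clean=False, char_span=False)
print(segmenter.segment("Ragas ships metrics. It also ships prompts."))
# a two-element list, one entry per sentence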