diff --git a/Makefile b/Makefile index 66f1f377b..33d580fa9 100644 --- a/Makefile +++ b/Makefile @@ -24,14 +24,14 @@ clean: ## Clean all generated files @cd $(GIT_ROOT)/docs && make clean @cd $(GIT_ROOT) || exit 1 @find . -type f -name '*.py[co]' -delete -o -type d -name __pycache__ -delete -run-ci: format lint type ## Running all CI checks test: ## Run tests @echo "Running tests..." @pytest --nbmake tests/unit $(shell if [ -n "$(k)" ]; then echo "-k $(k)"; fi) test-e2e: ## Run end2end tests echo "running end2end tests..." @pytest --nbmake tests/e2e -s - +run-ci: format lint type test ## Running all CI checks + # Docs docsite: ## Build and serve documentation @echo "Generating reference pages..." diff --git a/docs/howtos/customizations/metrics/modifying-prompts-metrics.ipynb b/docs/howtos/customizations/metrics/modifying-prompts-metrics.ipynb index 2a55575f4..2389ab3b7 100644 --- a/docs/howtos/customizations/metrics/modifying-prompts-metrics.ipynb +++ b/docs/howtos/customizations/metrics/modifying-prompts-metrics.ipynb @@ -42,7 +42,8 @@ ], "source": [ "from ragas.metrics._simple_criteria import SimpleCriteriaScoreWithReference\n", - "scorer = SimpleCriteriaScoreWithReference(name='random',definition=\"some definition\")\n", + "\n", + "scorer = SimpleCriteriaScoreWithReference(name=\"random\", definition=\"some definition\")\n", "scorer.get_prompts()" ] }, @@ -62,7 +63,7 @@ ], "source": [ "prompts = scorer.get_prompts()\n", - "print(prompts['single_turn_prompt'].to_string())" + "print(prompts[\"single_turn_prompt\"].to_string())" ] }, { @@ -81,7 +82,7 @@ "metadata": {}, "outputs": [], "source": [ - "prompt = scorer.get_prompts()['single_turn_prompt']\n", + "prompt = scorer.get_prompts()[\"single_turn_prompt\"]\n", "prompt.instruction += \"\\nOnly output valid JSON.\"" ] }, @@ -92,9 +93,7 @@ "metadata": {}, "outputs": [], "source": [ - "scorer.set_prompts(**{\n", - " 'single_turn_prompt':prompt\n", - "})" + "scorer.set_prompts(**{\"single_turn_prompt\": prompt})" ] }, { @@ -122,7 +121,7 @@ } ], "source": [ - "print(scorer.get_prompts()['single_turn_prompt'].instruction)" + "print(scorer.get_prompts()[\"single_turn_prompt\"].instruction)" ] }, { @@ -153,7 +152,7 @@ } ], "source": [ - "prompt = scorer.get_prompts()['single_turn_prompt']\n", + "prompt = scorer.get_prompts()[\"single_turn_prompt\"]\n", "\n", "prompt.examples" ] @@ -165,7 +164,10 @@ "metadata": {}, "outputs": [], "source": [ - "from ragas.metrics._simple_criteria import SingleTurnSimpleCriteriaWithReferenceInput, SimpleCriteriaOutput" + "from ragas.metrics._simple_criteria import (\n", + " SingleTurnSimpleCriteriaWithReferenceInput,\n", + " SimpleCriteriaOutput,\n", + ")" ] }, { @@ -178,15 +180,15 @@ "new_example = [\n", " (\n", " SingleTurnSimpleCriteriaWithReferenceInput(\n", - " user_input='Who was the first president of the United States?',\n", - " response='Thomas Jefferson was the first president of the United States.',\n", - " criteria='Score responses in range of 0 (low) to 5 (high) based similarity with reference.',\n", - " reference='George Washington was the first president of the United States.'\n", + " user_input=\"Who was the first president of the United States?\",\n", + " response=\"Thomas Jefferson was the first president of the United States.\",\n", + " criteria=\"Score responses in range of 0 (low) to 5 (high) based similarity with reference.\",\n", + " reference=\"George Washington was the first president of the United States.\",\n", " ),\n", " SimpleCriteriaOutput(\n", - " reason='The response incorrectly 
states Thomas Jefferson instead of George Washington. While both are significant historical figures, the answer does not match the reference.',\n", - " score=2\n", - " )\n", + " reason=\"The response incorrectly states Thomas Jefferson instead of George Washington. While both are significant historical figures, the answer does not match the reference.\",\n", + " score=2,\n", + " ),\n", " )\n", "]" ] @@ -208,9 +210,7 @@ "metadata": {}, "outputs": [], "source": [ - "scorer.set_prompts(**{\n", - " 'single_turn_prompt':prompt\n", - "})" + "scorer.set_prompts(**{\"single_turn_prompt\": prompt})" ] }, { @@ -228,7 +228,7 @@ } ], "source": [ - "print(scorer.get_prompts()['single_turn_prompt'].examples)" + "print(scorer.get_prompts()[\"single_turn_prompt\"].examples)" ] }, { @@ -246,7 +246,7 @@ "metadata": {}, "outputs": [], "source": [ - "scorer.get_prompts()['single_turn_prompt'].to_string()" + "scorer.get_prompts()[\"single_turn_prompt\"].to_string()" ] } ], diff --git a/docs/howtos/customizations/metrics/write_your_own_metric.ipynb b/docs/howtos/customizations/metrics/write_your_own_metric.ipynb index db2f3d557..1a46375a1 100644 --- a/docs/howtos/customizations/metrics/write_your_own_metric.ipynb +++ b/docs/howtos/customizations/metrics/write_your_own_metric.ipynb @@ -73,10 +73,12 @@ "from ragas.prompt import PydanticPrompt\n", "from pydantic import BaseModel, Field\n", "\n", + "\n", "class RefusalInput(BaseModel):\n", - " user_input:str = Field(description=\"the user request\")\n", + " user_input: str = Field(description=\"the user request\")\n", " response: str = Field(description=\"response from AI\")\n", "\n", + "\n", "class RefusalOutput(BaseModel):\n", " refusal: bool = Field(description=\"boolean indicating if request was refused\")\n", "\n", @@ -87,18 +89,20 @@ " output_model = RefusalOutput\n", " examples = [\n", " (\n", - " RefusalInput(user_input=\"Can you tell me how to hack a system?\", \n", - " response=\"I'm sorry, I can't assist with that.\"),\n", - " RefusalOutput(refusal=True)\n", + " RefusalInput(\n", + " user_input=\"Can you tell me how to hack a system?\",\n", + " response=\"I'm sorry, I can't assist with that.\",\n", + " ),\n", + " RefusalOutput(refusal=True),\n", " ),\n", " (\n", - " RefusalInput(user_input=\"What's the weather like today?\", \n", - " response=\"The weather is sunny with a high of 25°C.\"),\n", - " RefusalOutput(refusal=False)\n", - " )\n", - " ]\n", - "\n", - " \n" + " RefusalInput(\n", + " user_input=\"What's the weather like today?\",\n", + " response=\"The weather is sunny with a high of 25°C.\",\n", + " ),\n", + " RefusalOutput(refusal=False),\n", + " ),\n", + " ]" ] }, { @@ -144,14 +148,22 @@ "\n", " async def _single_turn_ascore(self, sample, callbacks):\n", "\n", - " prompt_input = RefusalInput(user_input=sample.user_input, response=sample.response)\n", - " prompt_response = await self.refusal_prompt.generate(data=prompt_input,llm=self.llm)\n", + " prompt_input = RefusalInput(\n", + " user_input=sample.user_input, response=sample.response\n", + " )\n", + " prompt_response = await self.refusal_prompt.generate(\n", + " data=prompt_input, llm=self.llm\n", + " )\n", " return int(prompt_response.refusal)\n", "\n", " async def _multi_turn_ascore(self, sample, callbacks):\n", "\n", " conversations = sample.user_input\n", - " conversations = [message for message in conversations if isinstance(message, AIMessage) or isinstance(message, HumanMessage)]\n", + " conversations = [\n", + " message\n", + " for message in conversations\n", + " if 
isinstance(message, AIMessage) or isinstance(message, HumanMessage)\n", + " ]\n", "\n", " grouped_messages = []\n", " for msg in conversations:\n", @@ -160,24 +172,19 @@ " elif isinstance(msg, AIMessage) and human_msg:\n", " grouped_messages.append((human_msg, msg))\n", " human_msg = None\n", - " \n", "\n", " grouped_messages = [item for item in grouped_messages if item[0]]\n", " scores = []\n", " for turn in grouped_messages:\n", - " prompt_input = RefusalInput(user_input=turn[0].content, response=turn[1].content)\n", - " prompt_response = await self.refusal_prompt.generate(data=prompt_input,llm=self.llm)\n", + " prompt_input = RefusalInput(\n", + " user_input=turn[0].content, response=turn[1].content\n", + " )\n", + " prompt_response = await self.refusal_prompt.generate(\n", + " data=prompt_input, llm=self.llm\n", + " )\n", " scores.append(prompt_response.refusal)\n", "\n", - " return sum(scores)\n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " " + " return sum(scores)" ] }, { @@ -255,21 +262,41 @@ "metadata": {}, "outputs": [], "source": [ - "sample = MultiTurnSample(user_input=[\n", - " HumanMessage(content=\"Hey, book a table at the nearest best Chinese restaurant for 8:00pm\"),\n", - " AIMessage(content=\"Sure, let me find the best options for you.\", tool_calls=[\n", - " ToolCall(name=\"restaurant_search\", args={\"cuisine\": \"Chinese\", \"time\": \"8:00pm\"})\n", - " ]),\n", - " ToolMessage(content=\"Found a few options: 1. Golden Dragon, 2. Jade Palace\"),\n", - " AIMessage(content=\"I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?\"),\n", - " HumanMessage(content=\"Let's go with Golden Dragon.\"),\n", - " AIMessage(content=\"Great choice! I'll book a table for 8:00pm at Golden Dragon.\", tool_calls=[\n", - " ToolCall(name=\"restaurant_book\", args={\"name\": \"Golden Dragon\", \"time\": \"8:00pm\"})\n", - " ]),\n", - " ToolMessage(content=\"Table booked at Golden Dragon for 8:00pm.\"),\n", - " AIMessage(content=\"Your table at Golden Dragon is booked for 8:00pm. Enjoy your meal!\"),\n", - " HumanMessage(content=\"thanks\"),\n", - "])" + "sample = MultiTurnSample(\n", + " user_input=[\n", + " HumanMessage(\n", + " content=\"Hey, book a table at the nearest best Chinese restaurant for 8:00pm\"\n", + " ),\n", + " AIMessage(\n", + " content=\"Sure, let me find the best options for you.\",\n", + " tool_calls=[\n", + " ToolCall(\n", + " name=\"restaurant_search\",\n", + " args={\"cuisine\": \"Chinese\", \"time\": \"8:00pm\"},\n", + " )\n", + " ],\n", + " ),\n", + " ToolMessage(content=\"Found a few options: 1. Golden Dragon, 2. Jade Palace\"),\n", + " AIMessage(\n", + " content=\"I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?\"\n", + " ),\n", + " HumanMessage(content=\"Let's go with Golden Dragon.\"),\n", + " AIMessage(\n", + " content=\"Great choice! I'll book a table for 8:00pm at Golden Dragon.\",\n", + " tool_calls=[\n", + " ToolCall(\n", + " name=\"restaurant_book\",\n", + " args={\"name\": \"Golden Dragon\", \"time\": \"8:00pm\"},\n", + " )\n", + " ],\n", + " ),\n", + " ToolMessage(content=\"Table booked at Golden Dragon for 8:00pm.\"),\n", + " AIMessage(\n", + " content=\"Your table at Golden Dragon is booked for 8:00pm. 
Enjoy your meal!\"\n", + " ),\n", + " HumanMessage(content=\"thanks\"),\n", + " ]\n", + ")" ] }, { diff --git a/docs/howtos/customizations/run_config.ipynb b/docs/howtos/customizations/run_config.ipynb index 4403c5f1a..954c3d57d 100644 --- a/docs/howtos/customizations/run_config.ipynb +++ b/docs/howtos/customizations/run_config.ipynb @@ -29,13 +29,11 @@ "metadata": {}, "outputs": [], "source": [ - "\n", - "\n", "from ragas.run_config import RunConfig\n", "\n", "# increasing max_workers to 64 and timeout to 60 seconds\n", "\n", - "my_run_config=RunConfig(max_workers=64, timeout=60) " + "my_run_config = RunConfig(max_workers=64, timeout=60)" ] }, { @@ -56,15 +54,15 @@ "from datasets import load_dataset\n", "from ragas import evaluate\n", "\n", - "dataset = load_dataset(\"explodinggradients/amnesty_qa\",\"english_v3\")\n", + "dataset = load_dataset(\"explodinggradients/amnesty_qa\", \"english_v3\")\n", "\n", "samples = []\n", - "for row in dataset['eval']:\n", + "for row in dataset[\"eval\"]:\n", " sample = SingleTurnSample(\n", - " user_input=row['user_input'],\n", - " reference=row['reference'],\n", - " response=row['response'],\n", - " retrieved_contexts=row['retrieved_contexts']\n", + " user_input=row[\"user_input\"],\n", + " reference=row[\"reference\"],\n", + " response=row[\"response\"],\n", + " retrieved_contexts=row[\"retrieved_contexts\"],\n", " )\n", " samples.append(sample)\n", "\n", diff --git a/pyproject.toml b/pyproject.toml index 98af97220..bf8093a7b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,5 +59,7 @@ build-backend = "setuptools.build_meta" write_to = "src/ragas/_version.py" [tool.pytest.ini_options] -addopts = "-n 4" -asyncio_default_fixture_loop_scope = "function" \ No newline at end of file +addopts = "-n 0" +asyncio_default_fixture_loop_scope = "function" +[pytest] +testpaths = ["tests"] diff --git a/src/ragas/exceptions.py b/src/ragas/exceptions.py index bbe059c7b..b4e29751f 100644 --- a/src/ragas/exceptions.py +++ b/src/ragas/exceptions.py @@ -19,3 +19,15 @@ class ExceptionInRunner(RagasException): def __init__(self): msg = "The runner thread which was running the jobs raised an exeception. Read the traceback above to debug it. You can also pass `raise_exceptions=False` incase you want to show only a warning message instead." super().__init__(msg) + + +class RagasOutputParserException(RagasException): + """ + Exception raised when the output parser fails to parse the output. + """ + + def __init__(self, num_retries: int): + msg = ( + f"The output parser failed to parse the output after {num_retries} retries." 
+ ) + super().__init__(msg) diff --git a/src/ragas/integrations/langchain.py b/src/ragas/integrations/langchain.py index 7d1bbd0de..36821cdfb 100644 --- a/src/ragas/integrations/langchain.py +++ b/src/ragas/integrations/langchain.py @@ -48,9 +48,9 @@ def __init__(self, metric: Metric, **kwargs: t.Any): t.cast(MetricWithLLM, self.metric).llm = LangchainLLMWrapper(llm) if isinstance(self.metric, MetricWithEmbeddings): embeddings = get_or_init(kwargs, "embeddings", OpenAIEmbeddings) - t.cast( - MetricWithEmbeddings, self.metric - ).embeddings = LangchainEmbeddingsWrapper(embeddings) + t.cast(MetricWithEmbeddings, self.metric).embeddings = ( + LangchainEmbeddingsWrapper(embeddings) + ) self.metric.init(run_config) assert isinstance( diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py index f0b2b3daa..5df59ffb0 100644 --- a/src/ragas/llms/base.py +++ b/src/ragas/llms/base.py @@ -47,6 +47,7 @@ def is_multiple_completion_supported(llm: BaseLanguageModel) -> bool: @dataclass class BaseRagasLLM(ABC): run_config: RunConfig = field(default_factory=RunConfig) + multiple_completion_supported: bool = False def set_run_config(self, run_config: RunConfig): self.run_config = run_config diff --git a/src/ragas/llms/output_parser.py b/src/ragas/llms/output_parser.py index 254b96cc2..29b6a7ca7 100644 --- a/src/ragas/llms/output_parser.py +++ b/src/ragas/llms/output_parser.py @@ -53,7 +53,7 @@ def get_json_format_instructions(pydantic_object: t.Type[TBaseModel]) -> str: return resp -class RagasoutputParser(PydanticOutputParser): +class RagasOutputParserOld(PydanticOutputParser): async def aparse( # type: ignore self, result: str, prompt: PromptValue, llm: BaseRagasLLM, max_retries: int = 1 ): diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index 93cce2014..d0c599d66 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -8,7 +8,7 @@ from pydantic import BaseModel from ragas.dataset_schema import SingleTurnSample -from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions +from ragas.llms.output_parser import RagasOutputParserOld, get_json_format_instructions from ragas.llms.prompt import Prompt, PromptValue from ragas.metrics._answer_similarity import AnswerSimilarity from ragas.metrics._faithfulness import ( @@ -39,7 +39,7 @@ class AnswerCorrectnessClassification(BaseModel): _output_instructions = get_json_format_instructions(AnswerCorrectnessClassification) -_output_parser = RagasoutputParser(pydantic_object=AnswerCorrectnessClassification) +_output_parser = RagasOutputParserOld(pydantic_object=AnswerCorrectnessClassification) CORRECTNESS_INSTRUCTIONS = """\ Given a ground truth and an answer statements, analyze each statement and classify them in one of the following categories: diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py index 74e2e0c74..50d787357 100644 --- a/src/ragas/metrics/_answer_relevance.py +++ b/src/ragas/metrics/_answer_relevance.py @@ -22,7 +22,6 @@ from langchain_core.callbacks import Callbacks - class ResponseRelevanceOutput(BaseModel): question: str noncommittal: int @@ -137,7 +136,9 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: responses = [] for _ in range(self.strictness): response = await self.question_generation.generate( - data=prompt_input, llm=self.llm, callbacks=callbacks, + data=prompt_input, + llm=self.llm, + callbacks=callbacks, ) responses.append(response) diff --git 
a/src/ragas/metrics/_context_entities_recall.py b/src/ragas/metrics/_context_entities_recall.py index a1cf78943..10d3725bd 100644 --- a/src/ragas/metrics/_context_entities_recall.py +++ b/src/ragas/metrics/_context_entities_recall.py @@ -9,7 +9,7 @@ from langchain.pydantic_v1 import BaseModel from ragas.dataset_schema import SingleTurnSample -from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions +from ragas.llms.output_parser import RagasOutputParserOld, get_json_format_instructions from ragas.llms.prompt import Prompt from ragas.metrics.base import MetricType, MetricWithLLM, SingleTurnMetric @@ -26,7 +26,7 @@ class ContextEntitiesResponse(BaseModel): _output_instructions = get_json_format_instructions( pydantic_object=ContextEntitiesResponse # type: ignore ) -_output_parser = RagasoutputParser(pydantic_object=ContextEntitiesResponse) +_output_parser = RagasOutputParserOld(pydantic_object=ContextEntitiesResponse) TEXT_ENTITY_EXTRACTION = Prompt( diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py index 4262c2cf0..32ba34cd6 100644 --- a/src/ragas/metrics/_context_precision.py +++ b/src/ragas/metrics/_context_precision.py @@ -8,7 +8,7 @@ from langchain.pydantic_v1 import BaseModel, Field from ragas.dataset_schema import SingleTurnSample -from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions +from ragas.llms.output_parser import RagasOutputParserOld, get_json_format_instructions from ragas.llms.prompt import Prompt, PromptValue from ragas.metrics._string import NonLLMStringSimilarity from ragas.metrics.base import MetricType, MetricWithLLM, SingleTurnMetric, ensembler @@ -35,7 +35,7 @@ class ContextPrecisionVerifications(BaseModel): _verification_output_instructions = get_json_format_instructions( ContextPrecisionVerification # type: ignore ) -_output_parser = RagasoutputParser(pydantic_object=ContextPrecisionVerification) +_output_parser = RagasOutputParserOld(pydantic_object=ContextPrecisionVerification) CONTEXT_PRECISION = Prompt( name="context_precision", diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py index b2f95f133..724b406f8 100644 --- a/src/ragas/metrics/_context_recall.py +++ b/src/ragas/metrics/_context_recall.py @@ -8,7 +8,7 @@ from pydantic import BaseModel, RootModel from ragas.dataset_schema import SingleTurnSample -from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions +from ragas.llms.output_parser import RagasOutputParserOld, get_json_format_instructions from ragas.llms.prompt import Prompt from ragas.metrics._string import NonLLMStringSimilarity from ragas.metrics.base import MetricType, MetricWithLLM, SingleTurnMetric, ensembler @@ -39,7 +39,9 @@ def dicts(self) -> t.List[t.Dict]: _classification_output_instructions = get_json_format_instructions( ContextRecallClassificationAnswers ) -_output_parser = RagasoutputParser(pydantic_object=ContextRecallClassificationAnswers) +_output_parser = RagasOutputParserOld( + pydantic_object=ContextRecallClassificationAnswers +) CONTEXT_RECALL_RA = Prompt( diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index 2a756c229..1d4b7ba1e 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -9,7 +9,7 @@ from pydantic import BaseModel, Field, RootModel from ragas.dataset_schema import SingleTurnSample -from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions +from 
ragas.llms.output_parser import RagasOutputParserOld, get_json_format_instructions from ragas.llms.prompt import Prompt from ragas.metrics.base import ( MetricType, @@ -44,7 +44,7 @@ class StatementsAnswers(RootModel): _statements_output_instructions = get_json_format_instructions(StatementsAnswers) -_statements_output_parser = RagasoutputParser(pydantic_object=StatementsAnswers) +_statements_output_parser = RagasOutputParserOld(pydantic_object=StatementsAnswers) LONG_FORM_ANSWER_PROMPT = Prompt( @@ -101,7 +101,7 @@ def dicts(self): _faithfulness_output_instructions = get_json_format_instructions( StatementFaithfulnessAnswers ) -_faithfulness_output_parser = RagasoutputParser( +_faithfulness_output_parser = RagasOutputParserOld( pydantic_object=StatementFaithfulnessAnswers ) diff --git a/src/ragas/metrics/_summarization.py b/src/ragas/metrics/_summarization.py index 74a5a9175..90b4db3ed 100644 --- a/src/ragas/metrics/_summarization.py +++ b/src/ragas/metrics/_summarization.py @@ -8,7 +8,7 @@ from langchain.pydantic_v1 import BaseModel from ragas.dataset_schema import SingleTurnSample -from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions +from ragas.llms.output_parser import RagasOutputParserOld, get_json_format_instructions from ragas.llms.prompt import Prompt from ragas.metrics.base import MetricType, MetricWithLLM, SingleTurnMetric @@ -39,13 +39,13 @@ class GenerateAnswersResponse(BaseModel): _output_instructions_keyphrase_extraction = get_json_format_instructions( pydantic_object=ExtractKeyphrasesResponse # type: ignore ) -_output_parser_question_generation = RagasoutputParser( +_output_parser_question_generation = RagasOutputParserOld( pydantic_object=GenerateQuestionsResponse ) -_output_parser_answer_generation = RagasoutputParser( +_output_parser_answer_generation = RagasOutputParserOld( pydantic_object=GenerateAnswersResponse ) -_output_parser_keyphrase_extraction = RagasoutputParser( +_output_parser_keyphrase_extraction = RagasOutputParserOld( pydantic_object=ExtractKeyphrasesResponse ) diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index b8588b839..81e459b79 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -15,28 +15,23 @@ from dataclasses import dataclass, field from enum import Enum +from pysbd import Segmenter + from ragas.callbacks import new_group from ragas.dataset_schema import MultiTurnSample, SingleTurnSample from ragas.executor import is_event_loop_running +from ragas.prompt import PromptMixin from ragas.run_config import RunConfig -from ragas.utils import deprecated +from ragas.utils import RAGAS_SUPPORTED_LANGUAGE_CODES, deprecated if t.TYPE_CHECKING: from langchain_core.callbacks import Callbacks from ragas.embeddings import BaseRagasEmbeddings from ragas.llms import BaseRagasLLM - -from pysbd import Segmenter -from pysbd.languages import LANGUAGE_CODES - -from ragas.prompt import PromptMixin - logger = logging.getLogger(__name__) -LANGUAGE_CODES = {v.__name__.lower(): k for k, v in LANGUAGE_CODES.items()} - VALID_COLUMNS = [ "user_input", "retrieved_contexts", @@ -58,8 +53,7 @@ class Metric(ABC): @property @abstractmethod - def name(self) -> str: - ... + def name(self) -> str: ... @property def required_columns(self) -> t.Dict[str, t.Set[str]]: @@ -146,8 +140,7 @@ async def ascore( return score @abstractmethod - async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: - ... + async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: ... 
@dataclass @@ -245,8 +238,7 @@ async def _single_turn_ascore( self, sample: SingleTurnSample, callbacks: Callbacks, - ) -> float: - ... + ) -> float: ... class MultiTurnMetric(Metric): @@ -311,8 +303,7 @@ async def _multi_turn_ascore( self, sample: MultiTurnSample, callbacks: Callbacks, - ) -> float: - ... + ) -> float: ... class Ensember: @@ -358,12 +349,14 @@ def get_segmenter( Get a sentence segmenter for a given language """ language = language.lower() - if language not in LANGUAGE_CODES: + if language not in RAGAS_SUPPORTED_LANGUAGE_CODES: raise ValueError( - f"Language '{language}' not supported. Supported languages: {LANGUAGE_CODES.keys()}" + f"Language '{language}' not supported. Supported languages: {RAGAS_SUPPORTED_LANGUAGE_CODES.keys()}" ) return Segmenter( - language=LANGUAGE_CODES[language], clean=clean, char_span=char_span + language=RAGAS_SUPPORTED_LANGUAGE_CODES[language], + clean=clean, + char_span=char_span, ) diff --git a/src/ragas/prompt.py b/src/ragas/prompt.py deleted file mode 100644 index 6c3ddd917..000000000 --- a/src/ragas/prompt.py +++ /dev/null @@ -1,213 +0,0 @@ -from __future__ import annotations - -import inspect -import typing as t -from abc import ABC, abstractmethod - -import pydantic - -# Check Pydantic version -from pydantic import BaseModel - -from ragas.llms.output_parser import RagasoutputParser -from ragas.llms.prompt import PromptValue - -if t.TYPE_CHECKING: - from langchain_core.callbacks import Callbacks - - from ragas.llms.base import BaseRagasLLM - -PYDANTIC_V2 = pydantic.VERSION.startswith("2.") - - -class BasePrompt(ABC): - @abstractmethod - async def generate( - self, - llm: BaseRagasLLM, - data: t.Any, - n: int = 1, - temperature: t.Optional[float] = None, - stop: t.Optional[t.List[str]] = None, - callbacks: Callbacks = [], - ) -> t.Any: - pass - - -def model_to_dict( - model: BaseModel, - by_alias: bool = False, - exclude_unset: bool = False, - exclude_defaults: bool = False, -) -> t.Dict[str, t.Any]: - if PYDANTIC_V2: - return model.model_dump( # type: ignore - by_alias=by_alias, - exclude_unset=exclude_unset, - exclude_defaults=exclude_defaults, - ) - else: - return model.dict( - by_alias=by_alias, - exclude_unset=exclude_unset, - exclude_defaults=exclude_defaults, - ) - - -def to_json(model: t.Any, indent: int = 4) -> str: - if PYDANTIC_V2: - # Pydantic 2.x - return model.model_dump_json(indent=indent) - else: - # Pydantic 1.x - return model.json(indent=indent) - - -def model_to_json_schema(model: t.Type[BaseModel]) -> str: - if PYDANTIC_V2: - # NOTE: this is not the same as model.schema_json() - return model.model_json_schema() # type: ignore - else: - return model.schema_json() - - -InputModel = t.TypeVar("InputModel", bound=BaseModel) -OutputModel = t.TypeVar("OutputModel", bound=BaseModel) - - -class StringIO(BaseModel): - text: str - - -class BoolIO(BaseModel): - value: bool - - -class PydanticPrompt(BasePrompt, t.Generic[InputModel, OutputModel]): - input_model: t.Type[InputModel] - output_model: t.Type[OutputModel] - instruction: str - examples: t.List[t.Tuple[InputModel, OutputModel]] = [] - - def generate_instruction(self) -> str: - return self.instruction - - def generate_output_signature(self, indent: int = 4) -> str: - schema = model_to_json_schema(self.output_model) - return ( - f"Please return the output in a JSON format that complies with the " - f"following schema as specified in JSON Schema and OpenAPI specification:\n" - f"{schema}" - ) - - def generate_examples(self): - if self.examples: - example_strings = [] - for e 
in self.examples: - input_data, output_data = e - example_strings.append( - self.instruction - + "\n" - + "input: " - + to_json(input_data, indent=4) - + "\n" - + "output: " - + to_json(output_data, indent=4) - ) - - return ( - "These are some examples to show how to perform the above instruction\n" - + "\n\n".join(example_strings) - ) - # if no examples are provided - else: - return "" - - def to_string(self, data: InputModel) -> str: - # this needs a check - return ( - self.generate_instruction() - + "\n" - + self.generate_output_signature() - + "\n" - + self.generate_examples() - + "\nNow perform the above instruction with the following input\n" - + "input: " - + to_json(data, indent=4) - + "\n" - + "output: " - ) - - async def generate( - self, - llm: BaseRagasLLM, - data: InputModel, - n: int = 1, - temperature: t.Optional[float] = None, - stop: t.Optional[t.List[str]] = None, - callbacks: Callbacks = [], - ) -> OutputModel: - processed_data = self.process_input(data) - prompt_value = PromptValue(prompt_str=self.to_string(processed_data)) - resp = await llm.generate( - prompt_value, - n=n, - temperature=temperature, - stop=stop, - callbacks=callbacks, - ) - resp_text = resp.generations[0][0].text - parser = RagasoutputParser(pydantic_object=self.output_model) - answer = await parser.aparse(resp_text, prompt_value, llm, max_retries=3) - - # TODO: make sure RagasOutputPraser returns the same type as OutputModel - return self.process_output(answer, data) # type: ignore - - def process_input(self, input: InputModel) -> InputModel: - return input - - def process_output(self, output: OutputModel, input: InputModel) -> OutputModel: - return output - - -class StringPrompt(BasePrompt): - async def generate( - self, - llm: BaseRagasLLM, - data: str, - n: int = 1, - temperature: t.Optional[float] = None, - stop: t.Optional[t.List[str]] = None, - callbacks: Callbacks = [], - ) -> str: - prompt_value = PromptValue(prompt_str=data) - llm_result = await llm.agenerate_text( - prompt_value, - n=n, - temperature=temperature, - stop=stop, - callbacks=callbacks, - ) - return llm_result.generations[0][0].text - - -class PromptMixin: - def get_prompts(self) -> t.Dict[str, PydanticPrompt]: - prompts = {} - for name, value in inspect.getmembers(self): - if isinstance(value, PydanticPrompt): - prompts.update({name: value}) - return prompts - - def set_prompts(self, **prompts): - available_prompts = self.get_prompts() - for key, value in prompts.items(): - if key not in available_prompts: - raise ValueError( - f"Prompt with name '{key}' does not exist. Use get_prompts() to see available prompts." 
- ) - if not isinstance(value, PydanticPrompt): - raise ValueError( - f"Prompt with name '{key}' must be an instance of 'ragas.prompt.PydanticPrompt'" - ) - setattr(self, key, value) diff --git a/src/ragas/prompt/__init__.py b/src/ragas/prompt/__init__.py new file mode 100644 index 000000000..0dfebf6d0 --- /dev/null +++ b/src/ragas/prompt/__init__.py @@ -0,0 +1,11 @@ +from .base import BasePrompt, BoolIO, PydanticPrompt, StringIO, StringPrompt +from .mixin import PromptMixin + +__all__ = [ + "BasePrompt", + "BoolIO", + "PydanticPrompt", + "StringIO", + "StringPrompt", + "PromptMixin", +] diff --git a/src/ragas/prompt/base.py b/src/ragas/prompt/base.py new file mode 100644 index 000000000..674acbf70 --- /dev/null +++ b/src/ragas/prompt/base.py @@ -0,0 +1,384 @@ +from __future__ import annotations + +import logging +import typing as t +from abc import ABC, abstractmethod + +from langchain_core.exceptions import OutputParserException +from langchain_core.output_parsers import PydanticOutputParser +from pydantic import BaseModel + +from ragas.callbacks import new_group +from ragas.exceptions import RagasOutputParserException +from ragas.llms.prompt import PromptValue +from ragas.utils import RAGAS_SUPPORTED_LANGUAGE_CODES, camel_to_snake + +if t.TYPE_CHECKING: + from langchain_core.callbacks import Callbacks + + from ragas.llms.base import BaseRagasLLM + +logger = logging.getLogger(__name__) + + +class BasePrompt(ABC): + def __init__(self, name: t.Optional[str] = None, language: str = "english"): + if name is None: + self.name = camel_to_snake(self.__class__.__name__) + + if language not in RAGAS_SUPPORTED_LANGUAGE_CODES: + raise ValueError( + f"Language '{language}' not supported. Supported languages: {RAGAS_SUPPORTED_LANGUAGE_CODES.keys()}" + ) + self.language = language + + @abstractmethod + async def generate( + self, + llm: BaseRagasLLM, + data: t.Any, + temperature: t.Optional[float] = None, + stop: t.Optional[t.List[str]] = None, + callbacks: Callbacks = [], + ) -> t.Any: + """ + Generate a single completion from the prompt. + """ + pass + + @abstractmethod + def generate_multiple( + self, + llm: BaseRagasLLM, + data: t.Any, + n: int = 1, + temperature: t.Optional[float] = None, + stop: t.Optional[t.List[str]] = None, + callbacks: Callbacks = [], + ) -> t.Any: + """ + Generate multiple completions from the prompt. 
+ """ + pass + + +InputModel = t.TypeVar("InputModel", bound=BaseModel) +OutputModel = t.TypeVar("OutputModel", bound=BaseModel) + + +class StringIO(BaseModel): + text: str + + +class BoolIO(BaseModel): + value: bool + + +class PydanticPrompt(BasePrompt, t.Generic[InputModel, OutputModel]): + input_model: t.Type[InputModel] + output_model: t.Type[OutputModel] + instruction: str + examples: t.List[t.Tuple[InputModel, OutputModel]] = [] + + def _generate_instruction(self) -> str: + return self.instruction + + def _generate_output_signature(self, indent: int = 4) -> str: + return ( + f"Please return the output in a JSON format that complies with the " + f"following schema as specified in JSON Schema and OpenAPI specification:\n" + f"{self.output_model.model_json_schema()}" + ) + + def _generate_examples(self): + if self.examples: + example_strings = [] + for e in self.examples: + input_data, output_data = e + example_strings.append( + self.instruction + + "\n" + + "input: " + + input_data.model_dump_json(indent=4) + + "\n" + + "output: " + + output_data.model_dump_json(indent=4) + ) + + return ( + "These are some examples to show how to perform the above instruction\n" + + "\n\n".join(example_strings) + ) + # if no examples are provided + else: + return "" + + def to_string(self, data: InputModel) -> str: + # this needs a check + return ( + self._generate_instruction() + + "\n" + + self._generate_output_signature() + + "\n" + + self._generate_examples() + + "\nNow perform the above instruction with the following input\n" + + "input: " + + data.model_dump_json(indent=4) + + "\n" + + "output: " + ) + + async def generate( + self, + llm: BaseRagasLLM, + data: InputModel, + temperature: t.Optional[float] = None, + stop: t.Optional[t.List[str]] = None, + callbacks: t.Optional[Callbacks] = None, + ) -> OutputModel: + """ + Generate a single output using the provided language model and input data. + + This method is a special case of `generate_multiple` where only one output is generated. + + Parameters + ---------- + llm : BaseRagasLLM + The language model to use for generation. + data : InputModel + The input data for generation. + temperature : float, optional + The temperature parameter for controlling randomness in generation. + stop : List[str], optional + A list of stop sequences to end generation. + callbacks : Callbacks, optional + Callback functions to be called during the generation process. + + Returns + ------- + OutputModel + The generated output. + + Notes + ----- + This method internally calls `generate_multiple` with `n=1` and returns the first (and only) result. + """ + callbacks = callbacks or [] + + # this is just a special case of generate_multiple + output_single = await self.generate_multiple( + llm=llm, + data=data, + n=1, + temperature=temperature, + stop=stop, + callbacks=callbacks, + ) + return output_single[0] + + async def generate_multiple( + self, + llm: BaseRagasLLM, + data: InputModel, + n: int = 1, + temperature: t.Optional[float] = None, + stop: t.Optional[t.List[str]] = None, + callbacks: t.Optional[Callbacks] = None, + ) -> t.List[OutputModel]: + """ + Generate multiple outputs using the provided language model and input data. + + Parameters + ---------- + llm : BaseRagasLLM + The language model to use for generation. + data : InputModel + The input data for generation. + n : int, optional + The number of outputs to generate. Default is 1. + temperature : float, optional + The temperature parameter for controlling randomness in generation. 
+ stop : List[str], optional + A list of stop sequences to end generation. + callbacks : Callbacks, optional + Callback functions to be called during the generation process. + + Returns + ------- + List[OutputModel] + A list of generated outputs. + + Raises + ------ + RagasOutputParserException + If there's an error parsing the output. + """ + callbacks = callbacks or [] + processed_data = self.process_input(data) + prompt_rm, prompt_cb = new_group( + name=self.name, + inputs={"data": processed_data}, + callbacks=callbacks, + ) + prompt_value = PromptValue(prompt_str=self.to_string(processed_data)) + resp = await llm.generate( + prompt_value, + n=n, + temperature=temperature, + stop=stop, + callbacks=prompt_cb, + ) + + output_models = [] + parser = RagasOutputParser(pydantic_object=self.output_model) + for i in range(n): + output_string = resp.generations[0][i].text + try: + answer = await parser.parse_output_string( + output_string=output_string, + prompt_value=prompt_value, + llm=llm, + callbacks=prompt_cb, + max_retries=3, + ) + processed_output = self.process_output(answer, data) # type: ignore + output_models.append(processed_output) + except RagasOutputParserException as e: + prompt_rm.on_chain_error(error=e) + logger.error("Prompt %s failed to parse output: %s", self.name, e) + raise e + + prompt_rm.on_chain_end({"output": output_models}) + return output_models + + def process_input(self, input: InputModel) -> InputModel: + return input + + def process_output(self, output: OutputModel, input: InputModel) -> OutputModel: + return output + + +class StringPrompt(BasePrompt): + """ + A simple prompt that can be formatted with additional data using f-string syntax. + + This prompt is a simpler alternative to PydanticPrompt for those who prefer a more + flexible approach without the need for a Pydantic model. + + Parameters + ---------- + instruction : str + The instruction string that can be formatted with additional data. + + Examples + -------- + >>> from ragas.prompt import string_prompt + >>> await prompt.generate(llm=llm, data={"category": "commerce"}) + """ + + async def generate( + self, + llm: BaseRagasLLM, + data: str, + temperature: t.Optional[float] = None, + stop: t.Optional[t.List[str]] = None, + callbacks: Callbacks = [], + ) -> str: + """ + Generate text based on the instruction and provided data. + + Parameters + ---------- + llm : BaseRagasLLM + The language model to use for text generation. + data : Optional[Dict[str, Any]], optional + The data to format the instruction with, by default None. + n : int, optional + The number of completions to generate, by default 1. + temperature : Optional[float], optional + The temperature for text generation, by default None. + stop : Optional[List[str]], optional + The stop sequences for text generation, by default None. + callbacks : Callbacks, optional + The callbacks to use during text generation, by default []. + + Returns + ------- + str + The generated text. 
+ """ + llm_result = await llm.agenerate_text( + PromptValue(prompt_str=data), + n=1, + temperature=temperature, + stop=stop, + callbacks=callbacks, + ) + return llm_result.generations[0][0].text + + async def generate_multiple( + self, + llm: BaseRagasLLM, + data: str, + n: int = 1, + temperature: t.Optional[float] = None, + stop: t.Optional[t.List[str]] = None, + callbacks: Callbacks = [], + ) -> t.List[str]: + return [ + await self.generate(llm, data, temperature, stop, callbacks) + for _ in range(n) + ] + + +class OutputStringAndPrompt(BaseModel): + output_string: str + prompt_value: str + + +class FixOutputFormat(PydanticPrompt[OutputStringAndPrompt, StringIO]): + instruction = "The output string did not satisfy the constraints given in the prompt. Fix the output string and return it." + input_model = OutputStringAndPrompt + output_model = StringIO + + +fix_output_format_prompt = FixOutputFormat() + + +class RagasOutputParser(PydanticOutputParser[OutputModel]): + async def parse_output_string( + self, + output_string: str, + prompt_value: PromptValue, + llm: BaseRagasLLM, + callbacks: Callbacks, + max_retries: int = 1, + ): + callbacks = callbacks or [] + try: + result = super().parse(output_string) + except OutputParserException: + if max_retries != 0: + retry_rm, retry_cb = new_group( + name="fix_output_format", + inputs={"output_string": output_string}, + callbacks=callbacks, + ) + fixed_output_string = await fix_output_format_prompt.generate( + llm=llm, + data=OutputStringAndPrompt( + output_string=output_string, + prompt_value=prompt_value.to_string(), + ), + ) + retry_rm.on_chain_end({"fixed_output_string": fixed_output_string}) + return await self.parse_output_string( + output_string=fixed_output_string.text, + prompt_value=prompt_value, + llm=llm, + max_retries=max_retries - 1, + callbacks=callbacks, + ) + else: + raise RagasOutputParserException(num_retries=max_retries) + return result diff --git a/src/ragas/prompt/mixin.py b/src/ragas/prompt/mixin.py new file mode 100644 index 000000000..c577fbbe9 --- /dev/null +++ b/src/ragas/prompt/mixin.py @@ -0,0 +1,26 @@ +import inspect +import typing as t + +from .base import PydanticPrompt + + +class PromptMixin: + def get_prompts(self) -> t.Dict[str, PydanticPrompt]: + prompts = {} + for name, value in inspect.getmembers(self): + if isinstance(value, PydanticPrompt): + prompts.update({name: value}) + return prompts + + def set_prompts(self, **prompts): + available_prompts = self.get_prompts() + for key, value in prompts.items(): + if key not in available_prompts: + raise ValueError( + f"Prompt with name '{key}' does not exist. Use get_prompts() to see available prompts." 
+ ) + if not isinstance(value, PydanticPrompt): + raise ValueError( + f"Prompt with name '{key}' must be an instance of 'ragas.prompt.PydanticPrompt'" + ) + setattr(self, key, value) diff --git a/src/ragas/testset/transforms/extractors/llm_based.py b/src/ragas/testset/transforms/extractors/llm_based.py index 09d73a4a8..39b9bf0a4 100644 --- a/src/ragas/testset/transforms/extractors/llm_based.py +++ b/src/ragas/testset/transforms/extractors/llm_based.py @@ -108,7 +108,6 @@ class HeadlinesExtractorPrompt(PydanticPrompt[StringIO, Headlines]): ] - class NamedEntities(BaseModel): ORG: t.List[str] LOC: t.List[str] @@ -141,7 +140,6 @@ class NERPrompt(PydanticPrompt[StringIO, NEROutput]): ] - @dataclass class SummaryExtractor(LLMBasedExtractor): property_name: str = "summary" @@ -207,4 +205,3 @@ async def extract(self, node: Node) -> t.Tuple[str, t.Dict[str, t.List[str]]]: return self.property_name, {} result = await self.prompt.generate(self.llm, data=StringIO(text=node_text)) return self.property_name, result.entities.model_dump() - diff --git a/src/ragas/utils.py b/src/ragas/utils.py index bca9da871..92a3bc73e 100644 --- a/src/ragas/utils.py +++ b/src/ragas/utils.py @@ -2,17 +2,22 @@ import logging import os +import re import typing as t import warnings from functools import lru_cache import numpy as np from datasets import Dataset +from pysbd.languages import LANGUAGE_CODES if t.TYPE_CHECKING: from ragas.metrics.base import Metric DEBUG_ENV_VAR = "RAGAS_DEBUG" +RAGAS_SUPPORTED_LANGUAGE_CODES = { + v.__name__.lower(): k for k, v in LANGUAGE_CODES.items() +} @lru_cache(maxsize=1) @@ -207,3 +212,12 @@ def convert_v1_to_v2_dataset(dataset: Dataset) -> Dataset: def convert_v2_to_v1_dataset(dataset: Dataset) -> Dataset: columns_map = {k: v for k, v in REQUIRED_COLS_v1.items() if k in dataset.features} return dataset.rename_columns(columns_map) + + +def camel_to_snake(name): + """ + Convert a camelCase string to snake_case. + eg: HaiThere -> hai_there + """ + pattern = re.compile(r"(? hello\nworld -> world" + + +def test_prompt_hash(): + from ragas.prompt import StringPrompt + + class Prompt(StringPrompt): + instruction = "You are a helpful assistant." + + p = Prompt() + assert hash(p) == hash(p) + p.instruction = "You are a helpful assistant. And some more" + # assert hash(p) != hash(p) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index c14554ea6..71e1803b4 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -37,3 +37,17 @@ def test_check_if_sum_is_close(values, close_to, num_places): ) def test_get_from_dict(data_dict, key, expected): assert get_from_dict(data_dict, key) == expected + + +@pytest.mark.parametrize( + ["camel_case_string", "expected"], + [ + ("myVariableName", "my_variable_name"), + ("CamelCaseString", "camel_case_string"), + ("AnotherCamelCaseString", "another_camel_case_string"), + ], +) +def test_camel_to_snake(camel_case_string, expected): + from ragas.utils import camel_to_snake + + assert camel_to_snake(camel_case_string) == expected
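
The new ragas.prompt package added above (BasePrompt, PydanticPrompt, StringPrompt, PromptMixin) replaces the old single-module src/ragas/prompt.py. The following is a minimal sketch of the public surface this patch introduces: PydanticPrompt.generate, generate_multiple, and the new RagasOutputParserException. It is an illustration rather than part of the patch; EchoInput, EchoOutput, QuestionDetector, and the llm argument are hypothetical stand-ins, and any configured BaseRagasLLM wrapper (for example LangchainLLMWrapper around a LangChain chat model) is assumed to be usable here.

from pydantic import BaseModel, Field

from ragas.exceptions import RagasOutputParserException
from ragas.prompt import PydanticPrompt


class EchoInput(BaseModel):
    text: str = Field(description="text to classify")


class EchoOutput(BaseModel):
    is_question: bool = Field(description="whether the text is a question")


class QuestionDetector(PydanticPrompt[EchoInput, EchoOutput]):
    # the default prompt name is derived via camel_to_snake -> "question_detector"
    instruction = "Decide whether the input text is a question."
    input_model = EchoInput
    output_model = EchoOutput
    examples = [
        (EchoInput(text="What time is it?"), EchoOutput(is_question=True)),
        (EchoInput(text="It is noon."), EchoOutput(is_question=False)),
    ]


async def detect(llm, text: str):
    # llm is assumed to be a configured BaseRagasLLM (hypothetical in this sketch)
    prompt = QuestionDetector()
    try:
        # generate() is just the n=1 special case of generate_multiple()
        one = await prompt.generate(llm=llm, data=EchoInput(text=text))
        many = await prompt.generate_multiple(llm=llm, data=EchoInput(text=text), n=3)
        return one.is_question, [o.is_question for o in many]
    except RagasOutputParserException:
        # raised after the parser exhausts its fix-output-format retries
        return None, []

Running it comes down to asyncio.run(detect(llm, "Is it raining?")) once a real LLM wrapper has been constructed.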
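
StringPrompt in the new ragas/prompt/base.py is the lighter-weight option: per its generate() signature, data is the complete prompt string and the return value is the raw completion text. Below is a minimal sketch under the same assumption of an already configured BaseRagasLLM; the summarize helper is hypothetical.

from ragas.prompt import StringPrompt


async def summarize(llm, passage: str) -> str:
    prompt = StringPrompt()
    # data is sent to the LLM as-is; generate_multiple(n=...) repeats the same
    # call n times and returns a list of strings
    return await prompt.generate(llm=llm, data=f"Summarize in one sentence:\n{passage}")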
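
Two smaller relocations are also easy to exercise directly: camel_to_snake and the RAGAS_SUPPORTED_LANGUAGE_CODES mapping now live in ragas.utils, and get_segmenter in ragas.metrics.base validates languages against that mapping. The expected values in the comments come from the new unit test and docstring; clean and char_span are passed explicitly here to avoid assuming their defaults.

from ragas.metrics.base import get_segmenter
from ragas.utils import RAGAS_SUPPORTED_LANGUAGE_CODES, camel_to_snake

print(camel_to_snake("CamelCaseString"))            # camel_case_string
print("english" in RAGAS_SUPPORTED_LANGUAGE_CODES)  # True; keys are lower-cased pysbd language names

segmenter = get_segmenter(language="english", clean=False, char_span=False)
print(segmenter.segment("Ragas ships metrics. It also ships prompts."))
# a two-element list, one entry per sentence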