diff --git a/dspy/experimental/synthesizer.py b/dspy/experimental/synthesizer.py deleted file mode 100644 index a66bf4d2b..000000000 --- a/dspy/experimental/synthesizer.py +++ /dev/null @@ -1,183 +0,0 @@ -import random -from typing import List - -from datasets import Dataset -from tqdm import tqdm, trange - -import dspy - - -def format_examples(examples: List[dspy.Example]): - if isinstance(examples, str): - return examples - - formatted_example = "" - - for example in examples: - input_keys = example.inputs().keys() - label_keys = example.labels().keys() - - formatted_example += "Inputs:\n" - for key in input_keys: - formatted_example += f"{key}: {example[key]}\n" - - formatted_example += "Outputs:\n" - for key in label_keys: - formatted_example += f"{key}: {example[key]}\n" - - return formatted_example - -class ExplainTask(dspy.Signature): - """Analyze the provided set of datapoints carefully, and prepare a concise, comprehensible summary that captures the essence and purpose of the task these datapoints aim to address. Your summary should illuminate the general objective and the type of problem being solved, offering a clear picture of what the task entails at a high level. Avoid getting into the nuances of individual datapoints, specifics about models, examples, algorithms, or any intricate technicalities. Your explanation should serve to clarify the task's overall goal and its basic premise, without touching on methodologies or solutions.""" - - examples = dspy.InputField( - prefix="Examples Datapoints:-", - desc="List of datapoints to analyze and explain the task.", - format=format_examples, - ) - explanation = dspy.OutputField( - prefix="Task Description:", - desc="Explanation of the task.", - ) - -class GenerateFieldDescription(dspy.Signature): - """Generate a concise and informative description for a given field based on the provided name and task description. 
This description should be no longer than 10 words and should be in simple english.""" - - task_description = dspy.InputField( - prefix="Task Description:", - desc="Description of the task the field is an input to.", - ) - field_name = dspy.InputField( - prefix="Field Name:", - desc="Name of the field to generate synthetic data for.", - ) - field_description = dspy.OutputField( - prefix="Field Description:", - desc="Description of the field.", - ) - -class GenerateInputFieldsData(dspy.Signature): - """Generate synthetic data based on the task description and the given knowledge seed.""" - - knowledge_seed = dspy.InputField( - prefix="Knowledge Seed:", - desc="Seed for the knowledge base search to base the inputs around.", - format=lambda x: str(x), - ) - task_description = dspy.InputField( - prefix="Task Description:", - desc="Description of the task the field is an input to.", - ) - -class GenerateOutputFieldsData(dspy.Signature): - pass - -class Synthesizer: - def __init__(self): - self.explain_task = dspy.Predict(ExplainTask) - self.generate_field_description = dspy.Predict(GenerateFieldDescription) - - self.generate_input_data = GenerateInputFieldsData - self.generate_output_data = GenerateOutputFieldsData - - def _prepare_synthetic_data_predictors(self, input_keys: List[str], output_keys: List[str], task_description: str): - for key in tqdm(input_keys, desc="Preparing Input Fields"): - field_details = self.generate_field_description( - task_description=task_description, - field_name=key, - ) - - field_name = key - field_description = field_details.field_description - - output_field = dspy.OutputField( - prefix=f"{field_name}:", - desc=field_description, - ) - self.generate_input_data = self.generate_input_data.insert( - -1, - field_name, - output_field, - ) - - input_field = dspy.InputField( - prefix=f"{field_name}:", - desc=field_description, - ) - self.generate_output_data = self.generate_output_data.insert( - -1, - field_name, - input_field, - ) - - for key 
in tqdm(output_keys, desc="Preparing Output Fields"): - field_details = self.generate_field_description( - task_description=task_description, - field_name=key, - ) - - field_name = key - field_description = field_details.field_description - - output_field = dspy.OutputField( - prefix=f"{field_name}:", - desc=field_description, - ) - self.generate_output_data = self.generate_output_data.insert( - -1, - field_name, - output_field, - ) - - return dspy.ChainOfThought(self.generate_input_data), dspy.Predict(self.generate_output_data) - - def generate(self, examples: List[dspy.Example], num_data: int, task_description: str = None, input_keys: str = None, output_keys: str = None) -> List[dspy.Example]: - task_description = task_description or self.explain_task(examples=examples).explanation - self.generate_output_data.__doc__ = task_description - - input_keys = input_keys or [key for key in examples[0].inputs()] - output_keys = output_keys or [key for key in examples[0].labels()] - - self.input_predictor, self.output_predictor = self._prepare_synthetic_data_predictors( - input_keys=input_keys, - output_keys=output_keys, - task_description=task_description, - ) - - data = [] - - for idx in trange(num_data, desc="Generating Synthetic Data"): - inputs = self.input_predictor(task_description=task_description, knowledge_seed=random.randint(0, 1000000), config=dict(temperature=0.7+0.01*idx)) - - input_kwargs = { - key: getattr(inputs, key) - for key in input_keys - } - - outputs = self.output_predictor(**input_kwargs, config=dict(temperature=0.7+0.01*idx)) - - output_kwargs = { - key: getattr(outputs, key) - for key in output_keys - } - - data.append(dspy.Example(**input_kwargs, **output_kwargs).with_inputs(*input_keys)) - - return data - - - def export(self, data: List[dspy.Example], path: str, mode: str = None, **kwargs): - extention = mode or path.split(".")[-1] - - dataset = Dataset.from_list( - [example.toDict() for example in data], - ) - - if extention == "csv": - 
# Instruction suffixes that Synthesizer.generate() appends to the docstring of
# the input-generation signature when the matching option is configured.
# Both start with a blank line ("\n\n") so they read as a new paragraph after
# the base instructions.

# Appended when `num_example_for_optim` is set, i.e. when previously generated
# examples are passed to the input generator alongside the task description.
# Fixed wording: "beforehand" (was "before hand") and
# "is consistent with the task" (was "if consistent with task").
INPUT_GENERATION_TASK_WITH_EXAMPLES_SUFFIX = """\n\nI'll also be providing you some data I generated beforehand, make sure the data you generate is consistent with the task I provided but different from the data I provided in every way possible."""

# Appended when `feedback_mode` is set, i.e. when feedback on generated data is
# collected during generation.
INPUT_GENERATION_TASK_WITH_FEEDBACK_SUFFIX = "\n\nAdditionally, I'll be providing you with feedback on the data you generate, while generating the data make sure to take into account the feedback I provide and try to improve the data you generate based on the feedback I provide."
# Signature used by Synthesizer (wrapped in dspy.Predict as
# `update_task_description`) to rewrite the running task description after
# feedback on a generated example has been gathered.
# NOTE(review): the class docstring appears to double as the instruction text
# sent to the LM (generate() edits signature `__doc__` at runtime) — treat it
# as runtime text and confirm before rewording.
class UpdateTaskDescriptionBasedOnFeedback(dspy.Signature):
    """Update the task description based on the feedback provided. Ensure that the revised task description incorporates the feedback to improve its overall clarity and effectiveness. Focus on enhancing the task's goal and basic premise, without delving into specific data points, models, examples, algorithms, or technical intricacies. Your explanation should aim to clarify the task's fundamental objective and purpose."""

    # Input: the current task description that should be revised.
    task_description = dspy.InputField(
        prefix="Task Description:",
        desc="Description of the task.",
    )
    # Input: feedback text (human- or LLM-provided) to fold into the description.
    feedback = dspy.InputField(
        prefix="Feedback:",
        desc="Feedback on the task description.",
    )
    # Output: the revised description. It reuses the "Task Description:" prefix
    # of the input field above.
    updated_task_description = dspy.OutputField(
        prefix="Task Description:",
        desc="Updated description of the task.",
    )
# Signature used by Synthesizer (wrapped in dspy.Predict as
# `generate_field_description`) to obtain a short, human-readable description
# for a field from its name and the overall task description.
# NOTE(review): the class docstring appears to double as the instruction text
# sent to the LM — treat it as runtime text and confirm before rewording.
class GenerateFieldDescription(dspy.Signature):
    """Generate a concise and informative description for a given field based on the provided name and task description. This description should be no longer than 10 words and should be in simple english."""

    # Input: description of the task the field belongs to.
    task_description = dspy.InputField(
        prefix="Task Description:",
        desc="Description of the task the field is an input to.",
    )
    # Input: name of the field to describe.
    field_name = dspy.InputField(
        prefix="Field Name:",
        desc="Name of the field to generate synthetic data for.",
    )
    # Output: the generated description, read by callers as `.field_description`.
    field_description = dspy.OutputField(
        prefix="Field Description:",
        desc="Description of the field.",
    )
# Intentionally empty signature: Synthesizer fills it in at runtime.
# `_prepare_synthetic_data_predictors` inserts one InputField per input key and
# one OutputField per output key via `.insert(...)`, and `generate` assigns the
# task description to its `__doc__`.
class GenerateOutputFieldsData(dspy.Signature):
    pass
self.generate_input_data = GenerateInputFieldsData + self.generate_output_data = GenerateOutputFieldsData + + def _gather_feedback(self, examples: dspy.Example) -> str: + if self.config.feedback_mode == "human": + input_keys = examples.inputs().keys() + + print("-"*75) + print_text = "[bold blue]Generated Data:[bold blue]\n[bold red]Inputs:[bold red]\n" + + for key in input_keys: + print_text += f"\t[bold yellow]{key}[bold yellow]: [green]{examples[key]}[green]\n" + + rprint(print_text) + feedback = input("Provide feedback on the generated data: ") + print("-"*75) + + return feedback + + elif self.config.feedback_mode == "llm": + feedback = self.get_feedback_on_generation( + synthetic_data=[examples], + task_description=self.generate_output_data.__doc__, + ) + + return feedback.feedback + + else: + raise ValueError("Feedback mode should be either 'human' or 'llm'.") + + def _get_field_data(self, key: str, keys_dict: Mapping[str, str]): + if key.startswith("$"): + field_details = self.generate_field_description( + task_description=keys_dict["task_description"], + field_name=key, + ) + + field_name = key + field_description = field_details.field_description + + return field_name, field_description + + else: + field_name = key + field_description = keys_dict[key] + + return field_name, field_description + + def _prepare_synthetic_data_predictors( + self, + input_keys: Mapping[str, str], + output_keys: Mapping[str, str], + ground_source: Optional[Union[List[dspy.Example], dspy.Signature]] = None, + ): + for key in tqdm(input_keys, desc="Preparing Input Fields"): + field_name, field_description = self._get_field_data(key, input_keys) + + output_field = dspy.OutputField( + prefix=f"{field_name}:", + desc=field_description, + ) + self.generate_input_data = self.generate_input_data.insert( + -1, + field_name, + output_field, + ) + + if ground_source: + self.generate_input_data = self.generate_input_data.insert( + -1, + "ground_source", + dspy.InputField( + 
def _get_dataset_metadata(self, ground_source: Union[List[dspy.Example], dspy.Signature]):
    """Derive the task description and field maps from ``ground_source``.

    Args:
        ground_source: Either a signature class (an instance of
            ``dspy.SignatureMeta``) or a non-empty list of ``dspy.Example``.

    Returns:
        A ``(task_description, input_keys, output_keys)`` tuple. Each
        ``*_keys`` value maps a field name to its description. For a
        signature the description is the field's ``desc``; for examples it is
        the placeholder ``"${name}"``.

    Raises:
        ValueError: If ``ground_source`` is neither a signature nor a
            non-empty list of examples.
    """
    if isinstance(ground_source, dspy.SignatureMeta):
        # A signature without a docstring would otherwise crash on
        # `.startswith`; treat it as an empty description.
        task_description = ground_source.__doc__ or ""

        # Instructions starting with "Given the fields" are summarised by the
        # UnderstandTask predictor (presumably these are auto-generated
        # default instructions — confirm against dspy).
        if task_description.startswith("Given the fields"):
            # UnderstandTask declares `task_description` as its only input
            # field; the previous `examples=` keyword did not match it.
            task_description = self.understand_task(
                task_description=task_description,
            ).explanation

        input_keys = {
            name: field.json_schema_extra["desc"]
            for name, field in ground_source.input_fields.items()
        }
        output_keys = {
            name: field.json_schema_extra["desc"]
            for name, field in ground_source.output_fields.items()
        }

        return task_description, input_keys, output_keys

    # The emptiness check keeps `ground_source[0]` from raising IndexError, so
    # an empty list falls through to the ValueError below.
    if isinstance(ground_source, list) and ground_source and isinstance(ground_source[0], dspy.Example):
        task_description = self.explain_task(examples=ground_source).explanation

        # Field names come from the first example; descriptions are
        # "${name}" placeholders.
        first_example = ground_source[0]
        input_keys = {key: f"${{{key}}}" for key in first_example.inputs()}
        output_keys = {key: f"${{{key}}}" for key in first_example.labels()}

        return task_description, input_keys, output_keys

    raise ValueError("Ground source must be either a list of examples or a signature.")
self._get_dataset_metadata(ground_source) + + if self.config.num_example_for_optim: + self.generate_input_data.__doc__ += INPUT_GENERATION_TASK_WITH_EXAMPLES_SUFFIX + + if self.config.feedback_mode: + self.generate_input_data.__doc__ += INPUT_GENERATION_TASK_WITH_FEEDBACK_SUFFIX + + self.generate_output_data.__doc__ = task_description + + self.input_predictor, self.output_predictor = self._prepare_synthetic_data_predictors( + input_keys=input_keys, + output_keys=output_keys, + ground_source=ground_source if self.config.num_example_for_optim else None, + ) + + data = [] + feedback = "" + + for idx in trange(0, num_data, batch_size, desc="Generating Synthetic Data"): + iter_temperature = 0.7+0.01*idx + iter_seed = random.randint(0, 1000000) + + kwargs = { + "task_description": task_description, + "knowledge_seed": iter_seed, + "config": dict(temperature=iter_temperature, n=batch_size), + } + + if self.config.num_example_for_optim: + kwargs["ground_source"] = random.sample(ground_source, self.config.num_example_for_optim) + + with dspy.context(lm=self.input_lm): + inputs = self.input_predictor(**kwargs) + + input_kwargs = [{ + key: getattr(completions, key) + for key in input_keys + } for completions in inputs.completions] + + for kwargs in input_kwargs: + outputs = None + + with dspy.context(lm=self.output_lm, temperature=iter_temperature): + if self.config.output_teacher_module: + outputs = self.config.output_teacher_module(**kwargs) + + else: + outputs = self.output_predictor(**kwargs, config=dict(temperature=iter_temperature)) + + output_kwargs = { + key: getattr(outputs, key) + for key in output_keys + } + + data.append(dspy.Example(**kwargs, **output_kwargs).with_inputs(*input_keys)) + + if self.config.feedback_mode and idx < self.config.num_example_for_feedback: + feedback = self._gather_feedback(data[-1]) + + task_description = self.update_task_description( + task_description=task_description, + feedback=feedback, + ).updated_task_description + + 
def format_examples(examples: List["dspy.Example"]) -> str:
    """Render examples as plain text for use inside a prompt.

    Each example contributes an ``Inputs:`` section listing its input fields
    and an ``Outputs:`` section listing its label fields, one
    ``"<key>: <value>"`` line per field. Sections of consecutive examples are
    concatenated without any separator.

    Args:
        examples: Examples exposing ``inputs()``, ``labels()`` and item
            access by key. A string is accepted too and returned unchanged,
            so already-formatted text can be passed through.

    Returns:
        The formatted text; an empty string when ``examples`` is empty.
    """
    if isinstance(examples, str):
        return examples

    # Collect the pieces and join once instead of growing a string with `+=`.
    lines = []

    for example in examples:
        lines.append("Inputs:\n")
        lines.extend(f"{key}: {example[key]}\n" for key in example.inputs().keys())

        lines.append("Outputs:\n")
        lines.extend(f"{key}: {example[key]}\n" for key in example.labels().keys())

    return "".join(lines)
-optional = true +optional = false python-versions = ">=3.7" files = [ {file = "markdown-it-py-2.2.0.tar.gz", hash = "sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1"}, @@ -2245,7 +2245,7 @@ testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] name = "mdurl" version = "0.1.2" description = "Markdown URL utilities" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, @@ -4305,6 +4305,24 @@ requests = ">=2.0.0" [package.extras] rsa = ["oauthlib[signedtoken] (>=3.0.0)"] +[[package]] +name = "rich" +version = "13.7.1" +description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "rich-13.7.1-py3-none-any.whl", hash = "sha256:4edbae314f59eb482f54e9e30bf00d33350aaa94f4bfcd4e9e3110e64d0d7222"}, + {file = "rich-13.7.1.tar.gz", hash = "sha256:9be308cb1fe2f1f57d67ce99e95af38a1e2bc71ad9813b0e247cf7ffbcc3a432"}, +] + +[package.dependencies] +markdown-it-py = ">=2.2.0" +pygments = ">=2.13.0,<3.0.0" + +[package.extras] +jupyter = ["ipywidgets (>=7.5.1,<9)"] + [[package]] name = "rpds-py" version = "0.18.0" @@ -6228,4 +6246,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.12" -content-hash = "f7a5ab7c85e79920d41e45e9bbd17f0dbc1180c52d027235a656c270d9e79346" \ No newline at end of file +content-hash = "f7a5ab7c85e79920d41e45e9bbd17f0dbc1180c52d027235a656c270d9e79346" diff --git a/pyproject.toml b/pyproject.toml index d9ff8ccc6..3356bd951 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,6 +108,7 @@ sphinx_rtd_theme = { version = "*", optional = true } autodoc_pydantic = { version = "*", optional = true } sphinx-reredirects = { version = "^0.1.2", optional = true } sphinx-automodapi = { version = "0.16.0", optional = true } +rich = 
"^13.7.1" [tool.poetry.group.dev.dependencies]