Commit

Merge branch 'stanfordnlp:main' into main
mlederbauer authored Mar 26, 2024
2 parents d195871 + c8375fc commit fe3e2d0
Showing 13 changed files with 598 additions and 459 deletions.
13 changes: 7 additions & 6 deletions docs/docs/building-blocks/1-language_models.md
@@ -18,10 +18,6 @@ For example, to use OpenAI language models, you can do it as follows.
gpt3_turbo = dspy.OpenAI(model='gpt-3.5-turbo-1106', max_tokens=300)
dspy.configure(lm=gpt3_turbo)
```
**Output:**
```text
['Hello! How can I assist you today?']
```

## Directly calling the LM.

@@ -31,11 +27,16 @@ You can simply call the LM with a string to give it a raw prompt, i.e. a string.
gpt3_turbo("hello! this is a raw prompt to GPT-3.5")
```

**Output:**
```text
['Hello! How can I assist you today?']
```

This is almost never the recommended way to interact with LMs in DSPy, but it is allowed.

## Using the LM with DSPy signatures.

You can also use the LM via DSPy [signatures] and [modules], which we discuss in more depth in the remaining guides.
You can also use the LM via DSPy [`signature` (input/output spec)](https://dspy-docs.vercel.app/docs/building-blocks/signatures) and [`modules`](https://dspy-docs.vercel.app/docs/building-blocks/modules), which we discuss in more depth in the remaining guides.

```python
# Define a module (ChainOfThought) and assign it a signature (return an answer, given a question).
@@ -172,4 +173,4 @@ model = 'dist/prebuilt/mlc-chat-Llama-2-7b-chat-hf-q4f16_1'
model_path = 'dist/prebuilt/lib/Llama-2-7b-chat-hf-q4f16_1-cuda.so'

llama = dspy.ChatModuleClient(model=model, model_path=model_path)
```
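
The second hunk above cuts the signatures/modules example off mid-block. A minimal sketch of the complete pattern it introduces, assuming the standard string-form signature API (the question text is illustrative):

```python
import dspy

# Configure the LM as shown at the top of this file.
gpt3_turbo = dspy.OpenAI(model='gpt-3.5-turbo-1106', max_tokens=300)
dspy.configure(lm=gpt3_turbo)

# Define a module (ChainOfThought) and assign it a signature
# (return an answer, given a question).
qa = dspy.ChainOfThought('question -> answer')

# The module uses the globally configured LM; the question is illustrative.
response = qa(question="Where is Guaraní spoken?")
print(response.answer)
```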
10 changes: 1 addition & 9 deletions dsp/modules/azure_openai.py
@@ -1,14 +1,6 @@
import logging

# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(message)s",
handlers=[logging.FileHandler("azure_openai_usage.log")],
)

import functools
import json
import logging
from typing import Any, Literal, Optional, cast

import backoff
11 changes: 0 additions & 11 deletions dsp/modules/databricks.py
@@ -1,14 +1,3 @@
import logging

# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(message)s',
handlers=[
logging.FileHandler('openai_usage.log'),
],
)

import functools
import json
from typing import Literal, Optional
10 changes: 1 addition & 9 deletions dsp/modules/gpt3.py
@@ -1,14 +1,6 @@
import logging

# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(message)s",
handlers=[logging.FileHandler("openai_usage.log")],
)

import functools
import json
import logging
from typing import Any, Literal, Optional, cast

import backoff
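The three hunks above all remove the same pattern: configuring the root logger as an import-time side effect, which silently overrides the logging setup of any application that imports DSPy. A minimal sketch of the failure mode (file names illustrative):

```python
import logging

# What the removed module-level code did on import:
logging.basicConfig(
    level=logging.INFO,
    format="%(message)s",
    handlers=[logging.FileHandler("openai_usage.log")],
)

# basicConfig() is a no-op once the root logger has handlers, so a downstream
# application's own configuration silently does nothing...
logging.basicConfig(level=logging.DEBUG, filename="my_app.log")

# ...and its records land in openai_usage.log instead of my_app.log.
logging.getLogger(__name__).info("ends up in openai_usage.log")
```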
1 change: 1 addition & 0 deletions dspy/__init__.py
@@ -8,6 +8,7 @@

# Functional must be imported after primitives, predict and signatures
from .functional import * # isort: skip
from .utils.logging import logger, set_log_level, set_log_output

settings = dsp.settings

7 changes: 7 additions & 0 deletions dspy/datasets/dataloader.py
@@ -62,6 +62,13 @@ def from_json(self, file_path:str, fields: List[str] = None, input_keys: Tuple[str] = ()) -> List[dspy.Example]:

return [dspy.Example({field:row[field] for field in fields}).with_inputs(*input_keys) for row in dataset]

def from_parquet(self, file_path: str, fields: List[str] = None, input_keys: Tuple[str] = ()) -> List[dspy.Example]:
dataset = load_dataset("parquet", data_files=file_path)["train"]

if not fields:
fields = list(dataset.features)

return [dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys) for row in dataset]

def sample(
self,
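A hedged usage sketch for the new `from_parquet` method; the path and column names are hypothetical, and the `datasets` library must be installed since the method delegates to `load_dataset`:

```python
from dspy.datasets import DataLoader

dl = DataLoader()

# Load a local parquet file into dspy.Example objects, marking "question"
# as the input field (path and columns are hypothetical).
trainset = dl.from_parquet(
    "data/train.parquet",
    fields=["question", "answer"],
    input_keys=("question",),
)
print(len(trainset), trainset[0].question)
```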
44 changes: 21 additions & 23 deletions dspy/predict/program_of_thought.py
@@ -8,20 +8,22 @@


class ProgramOfThought(Module):
def __init__(self, signature, max_iters=3):
def __init__(self, signature, max_iters=3, import_white_list=None):
super().__init__()
self.signature = signature = ensure_signature(signature)
self.max_iters = max_iters
self.import_white_list = import_white_list

self.input_fields = signature.input_fields
self.output_fields = signature.output_fields

assert len(self.output_fields) == 1, "PoT only supports one output field."

self.output_field_name = next(iter(self.output_fields))
inputs_ = ", ".join(
[f"`{field_name}`" for field_name in self.input_fields.keys()],
)
outputs_ = ", ".join(
[f"`{field_name}`" for field_name in self.output_fields.keys()],
)
outputs_ = f"`{self.output_field_name}`"

assert len(self.output_fields) == 1, "PoT only supports one output field."

@@ -55,7 +57,6 @@ def __init__(self, signature, max_iters=3):
self._generate_instruction("answer"),
),
)

def _generate_signature(self, mode):
signature_dict = dict(self.input_fields)
fields_for_mode = {
@@ -92,7 +93,7 @@ def _generate_signature(self, mode):
prefix="Code Output:",
desc="output of previously-generated python code",
),
"answer": self.signature.fields["answer"],
self.output_field_name: self.signature.fields[self.output_field_name],
},
}
signature_dict.update(fields_for_mode[mode])
@@ -105,12 +106,7 @@ def _generate_instruction(self, mode):
for field_name in self._generate_signature(mode).input_fields
],
)
mode_outputs = ", ".join(
[
f"`{field_name}`"
for field_name in self._generate_signature(mode).output_fields
],
)
mode_outputs = f"`{self.output_field_name}`"
if mode == "generate":
instr = [
f"You will be given {mode_inputs} and you will respond with {mode_outputs}.",
@@ -120,7 +116,7 @@
elif mode == "regenerate":
instr = [
f"You are given {mode_inputs} due to an error in previous code.",
f"Your task is to correct the error and provide the new {mode_outputs}.",
"Your task is to correct the error and provide the new `generated_code`.",
]
else: # mode == 'answer'
instr = [
@@ -129,6 +125,7 @@

return "\n".join(instr)


def parse_code(self, code_data):
code = (
code_data.get("generated_code", "").split("---", 1)[0].split("\n\n\n", 1)[0]
@@ -156,32 +153,33 @@ def execute_code(self, code):
if not code:
return code, None, "Error: Empty code before execution."
code_prompt = CodePrompt(code, code_type="python")
interpreter = PythonInterpreter(action_space={"print": print})
interpreter = PythonInterpreter(action_space={"print": print}, import_white_list=self.import_white_list)
try:
output = str(code_prompt.execute(interpreter=interpreter)[0])
return code, output, None
except Exception as e:
return code, None, str(e)

def forward(self, **kwargs):
code_data = self.code_generate(question=kwargs["question"])
input_kwargs = {
field_name: kwargs[field_name] for field_name in self.input_fields
}
code_data = self.code_generate(**input_kwargs)
parsed_code, error = self.parse_code(code_data)
# FIXME: Don't try to execute the code if it didn't parse
code, output, error = self.execute_code(parsed_code)
hop = 0
while hop < self.max_iters and error:
print("Error in code execution")
code_data = self.code_regenerate(
question=kwargs["question"], previous_code=code, error=error,
)
input_kwargs.update({"previous_code": code, "error": error})
code_data = self.code_regenerate(**input_kwargs)
parsed_code, error = self.parse_code(code_data)
# FIXME: Don't try to execute the code if it didn't parse
code, output, error = self.execute_code(parsed_code)
hop += 1
if hop == self.max_iters:
print("Max hops reached. Error persists.")
return None
answer_gen_result = self.generate_answer(
question=kwargs["question"], final_generated_code=code, code_output=output,
)
return answer_gen_result
input_kwargs.update({"final_generated_code": code, "code_output": output})
answer_gen_result = self.generate_answer(**input_kwargs)
return answer_gen_result
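A hedged sketch of the new `import_white_list` parameter in use; the signature string and the whitelisted module are illustrative choices, not prescribed by the diff:

```python
import dspy

pot = dspy.ProgramOfThought(
    "question -> answer",
    max_iters=3,
    # Modules the sandboxed interpreter is allowed to import (illustrative).
    import_white_list=["math"],
)

result = pot(question="What is 7 factorial?")
print(result.answer)
```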
30 changes: 21 additions & 9 deletions dspy/retrieve/weaviate_rm.py
@@ -5,6 +5,8 @@

try:
import weaviate
import weaviate.classes as wvc
from weaviate.collections.classes.grpc import HybridFusion
except ImportError:
raise ImportError(
"The 'weaviate' extra is required to use WeaviateRM. Install it with `pip install dspy-ai[weaviate]`",
@@ -22,6 +24,9 @@ class WeaviateRM(dspy.Retrieve):
weaviate_collection_name (str): The name of the Weaviate collection.
weaviate_client (WeaviateClient): An instance of the Weaviate client.
k (int, optional): The default number of top passages to retrieve. Defaults to 3.
weaviate_collection_text_key (str, optional): The key in the collection holding the text content. Defaults to "content".
weaviate_alpha (float, optional): The alpha value for the hybrid query. Defaults to 0.5.
weaviate_fusion_type (HybridFusion, optional): The fusion type for the hybrid query. Defaults to RELATIVE_SCORE.
Examples:
Below is a code snippet that shows how to use Weaviate as the default retriever:
@@ -44,16 +49,20 @@ class WeaviateRM(dspy.Retrieve):

def __init__(self,
weaviate_collection_name: str,
weaviate_client: weaviate.Client,
weaviate_client: weaviate.WeaviateClient,
k: int = 3,
weaviate_collection_text_key: Optional[str] = "content",
weaviate_alpha: Optional[float] = 0.5,
weaviate_fusion_type: Optional[HybridFusion] = HybridFusion.RELATIVE_SCORE,
):
self._weaviate_collection_name = weaviate_collection_name
self._weaviate_client = weaviate_client
self._weaviate_collection_text_key = weaviate_collection_text_key
self._weaviate_alpha = weaviate_alpha
self._weaviate_fusion_type = weaviate_fusion_type
super().__init__(k=k)

def forward(self, query_or_queries: Union[str, List[str]], k: Optional[int]) -> dspy.Prediction:
def forward(self, query_or_queries: Union[str, List[str]], k: Optional[int] = None) -> dspy.Prediction:
"""Search with Weaviate for self.k top passages for query
Args:
@@ -72,14 +81,17 @@ def forward(self, query_or_queries: Union[str, List[str]], k: Optional[int]) -> dspy.Prediction:
queries = [q for q in queries if q]
passages = []
for query in queries:
results = self._weaviate_client.query\
.get(self._weaviate_collection_name, [self._weaviate_collection_text_key])\
.with_hybrid(query=query)\
.with_limit(k)\
.do()
collection = self._weaviate_client.collections.get(self._weaviate_collection_name)
results = collection.query.hybrid(query=query,
limit=k,
alpha=self._weaviate_alpha,
fusion_type=self._weaviate_fusion_type,
return_metadata=wvc.query.MetadataQuery(
distance=True, score=True),
)

results = results["data"]["Get"][self._weaviate_collection_name]
parsed_results = [result[self._weaviate_collection_text_key] for result in results]
parsed_results = [result.properties[self._weaviate_collection_text_key] for result in results.objects]
passages.extend(dotdict({"long_text": d}) for d in parsed_results)

# Return type not changed, needs to be a Prediction object. But other code will break if we change it.
return passages
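
A hedged sketch of constructing the updated retriever against the v4 client; the connection method and collection name are hypothetical:

```python
import weaviate
import dspy
from dspy.retrieve.weaviate_rm import WeaviateRM

# The retriever now expects a v4 WeaviateClient; a local instance is assumed.
client = weaviate.connect_to_local()

retriever = WeaviateRM(
    weaviate_collection_name="MyDocuments",  # hypothetical collection
    weaviate_client=client,
    k=3,
    weaviate_alpha=0.5,  # keyword/vector balance for the hybrid query
)

dspy.settings.configure(rm=retriever)
passages = retriever("What is DSPy?")
```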
1 change: 1 addition & 0 deletions dspy/utils/__init__.py
@@ -1 +1,2 @@
from .dummies import *
from .logging import *
103 changes: 103 additions & 0 deletions dspy/utils/logging.py
@@ -0,0 +1,103 @@

import logging
import os
import sys
import typing as t

import structlog

logger = structlog.get_logger()

class LogSettings:
    def __init__(self, level: str, output_type: str, method: str, file_name: t.Optional[str]) -> None:
        self.level = level
        self.output_type = output_type
        self.method = method
        self.file_name = file_name
        self._configure_structlog()

    def _configure_structlog(self):
        if self.output_type == "str":
            renderer = structlog.dev.ConsoleRenderer()
        else:
            renderer = structlog.processors.JSONRenderer()

        structlog.configure(
            processors=[
                structlog.stdlib.add_logger_name,
                structlog.stdlib.add_log_level,
                structlog.processors.CallsiteParameterAdder(
                    {
                        structlog.processors.CallsiteParameter.FILENAME,
                        structlog.processors.CallsiteParameter.LINENO,
                    },
                ),
                structlog.processors.TimeStamper(fmt="iso"),
                structlog.processors.StackInfoRenderer(),
                structlog.processors.format_exc_info,
                structlog.processors.UnicodeDecoder(),
                renderer,
            ],
            logger_factory=structlog.stdlib.LoggerFactory(),
            wrapper_class=structlog.stdlib.BoundLogger,
        )

    def set_log_level(self, level: str) -> None:
        """Set the logging level."""
        level = level.upper()
        if level not in ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]:
            raise ValueError(f"log level provided ({level}) is not one of DEBUG, INFO, WARNING, ERROR, CRITICAL")

        self.level = level

        log_level = getattr(logging, level)
        logger.setLevel(log_level)

    def set_log_output(self, method: t.Optional[str] = None, file_name: t.Optional[str] = None, output_type: t.Optional[str] = None):
        if method is not None and method not in ["console", "file"]:
            raise ValueError("method provided can only be 'console' or 'file'")

        if method == "file" and file_name is None:
            raise ValueError("file_name must be provided when method = 'file'")

        if method is not None:
            self.method = method
            self.file_name = file_name

        if output_type is not None and output_type not in ["str", "json"]:
            raise ValueError("output_type provided can only be 'str' or 'json'")

        if output_type is not None:
            self.output_type = output_type

        # Update the renderer.
        self._configure_structlog()

        # Grab the root logger and drop its existing handlers.
        log = logging.getLogger()
        for handler in log.handlers[:]:
            log.removeHandler(handler)

        # Add the new handler.
        if self.method == "file":
            assert self.file_name is not None
            log.addHandler(logging.FileHandler(self.file_name))
        else:
            log.addHandler(logging.StreamHandler(sys.stdout))


level = os.environ.get("log_level", "info").upper()

# Set defaults.
logging.basicConfig(
    format="%(message)s",
    stream=sys.stdout,
    level=level,
)

settings = LogSettings(level=level, output_type="str", method="console", file_name=None)
set_log_level = settings.set_log_level
set_log_output = settings.set_log_output
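
Since `dspy/__init__.py` now re-exports `logger`, `set_log_level`, and `set_log_output`, the new controls can be driven from the top-level package. A short sketch (the log file name is illustrative):

```python
import dspy

# Raise verbosity for a debugging session.
dspy.set_log_level("debug")

# Switch to structured JSON logs written to a file (name is illustrative).
dspy.set_log_output(method="file", file_name="dspy_run.log", output_type="json")

dspy.logger.info("pipeline starting")
```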