Merge branch 'ucbepic:main' into main
staru09 authored Dec 27, 2024
2 parents 7b703d6 + 0e077aa commit 9c0d9da
Showing 64 changed files with 5,803 additions and 2,441 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -15,6 +15,7 @@ workloads/*
*pytest_cache*
*ruff_cache*
motion-old*
venv/

# dependencies
website/node_modules
18 changes: 17 additions & 1 deletion docetl/config_wrapper.py
@@ -1,7 +1,7 @@
import datetime
import os
from docetl.console import get_console
from docetl.utils import load_config
from docetl.utils import decrypt, load_config
from typing import Any, Dict, List, Optional, Tuple, Union
from docetl.operations.utils import APIWrapper
import pyrate_limiter
@@ -71,6 +71,19 @@ def __init__(
self.console = DOCETL_CONSOLE
self.max_threads = max_threads or (os.cpu_count() or 1) * 4
self.status = None
encrypted_llm_api_keys = self.config.get("llm_api_keys", {})
if encrypted_llm_api_keys:
self.llm_api_keys = {
key: decrypt(value, os.environ.get("DOCETL_ENCRYPTION_KEY", ""))
for key, value in encrypted_llm_api_keys.items()
}
else:
self.llm_api_keys = {}

# Temporarily set environment variables for API keys
self._original_env = os.environ.copy()
for key, value in self.llm_api_keys.items():
os.environ[key] = value

buckets = {
param: pyrate_limiter.InMemoryBucket(
@@ -95,3 +108,6 @@ def __init__(
self.rate_limiter = pyrate_limiter.Limiter(bucket_factory, max_delay=math.inf)

self.api = APIWrapper(self)

def reset_env(self):
os.environ = self._original_env
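
For context, a minimal sketch of the flow this change introduces, assuming the pipeline config carries provider keys under `llm_api_keys` (the key name and encrypted value below are placeholders) and that the values were encrypted with the shared `DOCETL_ENCRYPTION_KEY`:

```python
import os

from docetl.utils import decrypt

# Placeholder config; in practice the encrypted value comes from the web frontend.
config = {"llm_api_keys": {"OPENAI_API_KEY": "<encrypted value>"}}
secret = os.environ.get("DOCETL_ENCRYPTION_KEY", "")

original_env = os.environ.copy()  # snapshot, as __init__ does
for name, value in config.get("llm_api_keys", {}).items():
    os.environ[name] = decrypt(value, secret)  # exported for the duration of the run

# ... run the pipeline ...

os.environ = original_env  # equivalent to reset_env()
```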
39 changes: 35 additions & 4 deletions docetl/operations/__init__.py
@@ -1,19 +1,50 @@
import importlib.metadata
from docetl.operations.cluster import ClusterOperation
from docetl.operations.code_operations import CodeFilterOperation, CodeMapOperation, CodeReduceOperation
from docetl.operations.equijoin import EquijoinOperation
from docetl.operations.filter import FilterOperation
from docetl.operations.gather import GatherOperation
from docetl.operations.map import MapOperation
from docetl.operations.reduce import ReduceOperation
from docetl.operations.resolve import ResolveOperation
from docetl.operations.split import SplitOperation
from docetl.operations.sample import SampleOperation
from docetl.operations.unnest import UnnestOperation


mapping = {
"cluster": ClusterOperation,
"code_filter": CodeFilterOperation,
"code_map": CodeMapOperation,
"code_reduce": CodeReduceOperation,
"equijoin": EquijoinOperation,
"filter": FilterOperation,
"gather": GatherOperation,
"map": MapOperation,
"reduce": ReduceOperation,
"resolve": ResolveOperation,
"split": SplitOperation,
"sample": SampleOperation,
"unnest": UnnestOperation,
}

def get_operation(operation_type: str):
"""Loads a single operation by name"""
try:
entrypoint = importlib.metadata.entry_points(group="docetl.operation")[
operation_type
]
except KeyError as e:
return entrypoint.load()
except KeyError:
if operation_type in mapping:
return mapping[operation_type]
raise KeyError(f"Unrecognized operation {operation_type}")
return entrypoint.load()

def get_operations():
"""Load all available operations and return them as a dictionary"""
return {
operations = mapping.copy()
operations.update({
op.name: op.load()
for op in importlib.metadata.entry_points(group="docetl.operation")
}
})
return operations
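
A quick usage sketch of the resulting lookup behavior: names resolve either from an installed `docetl.operation` entry point or from the built-in `mapping` fallback, and unknown names raise:

```python
from docetl.operations import get_operation, get_operations

MapOp = get_operation("map")   # entry point if one is installed, otherwise the built-in MapOperation
ops = get_operations()         # built-ins plus any third-party entry points
print(sorted(ops))             # ['cluster', 'code_filter', 'code_map', ...]

# get_operation("does_not_exist")  # raises KeyError("Unrecognized operation does_not_exist")
```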
2 changes: 1 addition & 1 deletion docetl/operations/utils/api.py
@@ -446,7 +446,7 @@ def _call_llm_with_cache(
dataset_description = self.runner.config.get("system_prompt", {}).get("dataset_description", "a collection of unstructured documents")
parethetical_op_instructions = "many inputs:one output" if op_type == "reduce" else "one input:one output"

system_prompt = f"You are a {persona}, intelligently transforming data. The dataset description is: {dataset_description}. You will be performing a {op_type} operation ({parethetical_op_instructions}). You will perform the specified task on the provided data, as accurately, precisely, and exhaustively as possible. The result should be a structured output that you will send back to the user."
system_prompt = f"You are a {persona}, helping the user make sense of their data. The dataset description is: {dataset_description}. You will be performing a {op_type} operation ({parethetical_op_instructions}). You will perform the specified task on the provided data, as precisely and exhaustively (i.e., high recall) as possible. The result should be a structured output that you will send back to the user, with the `send_output` function. Do not influence your answers too much based on the `send_output` function parameter names; just use them to send the result back to the user."
if scratchpad:
system_prompt += f"""
6 changes: 4 additions & 2 deletions docetl/operations/utils/llm.py
Expand Up @@ -5,7 +5,7 @@
from typing import Any, Dict, List, Optional
import tiktoken
from jinja2 import Template
from litellm import completion, RateLimitError
from litellm import model_cost
from pydantic import BaseModel
from rich import print as rprint

@@ -69,7 +69,9 @@ def truncate_messages(
from_agent: bool = False
) -> List[Dict[str, str]]:
"""Truncate messages to fit within model's context length."""
model_input_context_length = 8192 # Default
model_input_context_length = model_cost.get(model.split("/")[-1], {}).get(
"max_input_tokens", 8192
)
total_tokens = sum(count_tokens(json.dumps(msg), model) for msg in messages)

if total_tokens <= model_input_context_length - 100:
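
In effect the hard-coded 8192-token default is replaced by litellm's model metadata; a minimal sketch of the lookup (the concrete limit depends on litellm's `model_cost` table):

```python
from litellm import model_cost

def max_input_tokens(model: str, default: int = 8192) -> int:
    # Provider prefixes like "openai/" are stripped before the lookup.
    return model_cost.get(model.split("/")[-1], {}).get("max_input_tokens", default)

max_input_tokens("openai/gpt-4o-mini")   # e.g. 128000 if litellm knows the model
max_input_tokens("my-org/custom-model")  # unknown models fall back to 8192
```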
4 changes: 4 additions & 0 deletions docetl/operations/utils/validation.py
@@ -103,6 +103,10 @@ def convert_val(value: Any, model: str = "gpt-4o-mini") -> Dict[str, Any]:
if "gemini" not in model:
result["additionalProperties"] = False
return result
elif value.startswith("enum[") and value.endswith("]"):
enum_values = value[5:-1].strip().split(",")
enum_values = [v.strip() for v in enum_values]
return {"type": "string", "enum": enum_values}
else:
raise ValueError(f"Unsupported value type: {value}")

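A small sketch of what the new enum branch produces (assuming `convert_val` is imported from the module shown above):

```python
from docetl.operations.utils.validation import convert_val

convert_val("enum[positive, negative, neutral]")
# -> {"type": "string", "enum": ["positive", "negative", "neutral"]}
```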
4 changes: 3 additions & 1 deletion docetl/optimizers/map_optimizer/config_generators.py
@@ -84,7 +84,7 @@ def _get_split_config(
Determine the split key and subprompt for processing chunks of the input data.
The split key should be a key in the input data that contains a string to be split.
The subprompt should be designed to process individual chunks of the split data.
The subprompt should be designed to process individual chunks of the split data, and only process the main chunk within chunk delimiters if they are present.
Note that the subprompt's output schema might be different from the original operation's output schema, since you may want to extract more information or make the information less structured/more free text. The original output schema will be preserved when combining the chunks' processed results.
Important:
@@ -148,6 +148,8 @@ def _get_split_config(

result["subprompt_output_schema"].update(op_config["output"]["schema"])

result["subprompt"] = result["subprompt"] + " Only process the main chunk in --- Begin Main Chunk --- and --- End Main Chunk --- delimiters if they are present."

self.console.log(
f"[yellow]Breaking down operation {op_config['name']}[/yellow]"
)
2 changes: 1 addition & 1 deletion docetl/optimizers/map_optimizer/plan_generators.py
@@ -218,7 +218,7 @@ def determine_metadata_with_retry():
map_op = self.operation_creator.create_map_operation(
op_config,
subprompt_output_schema,
split_result["subprompt"] + " Only process the main chunk.",
split_result["subprompt"] ,
)

# unnest_ops = self.operation_creator.create_unnest_operations(op_config)
3 changes: 2 additions & 1 deletion docetl/runner.py
@@ -299,8 +299,9 @@ def save(self, data: List[Dict]):

with open(output_config["path"], "w", newline="") as file:
writer = csv.DictWriter(file, fieldnames=data[0].keys())
limited_data = [{k: d.get(k, None) for k in data[0].keys()} for d in data]
writer.writeheader()
writer.writerows(data)
writer.writerows(limited_data)
self.console.print(
f"[green italic]💾 Output saved to {output_config['path']}[/green italic]"
)
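
The projection guards against rows whose keys differ from the first row's: `csv.DictWriter` raises `ValueError` when a row contains fields not in `fieldnames`, so each row is limited to `data[0].keys()` and missing values fall back to `None`. A minimal sketch with made-up data:

```python
import csv
import io

data = [{"title": "a"}, {"title": "b", "extra": "would raise if written directly"}]

buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=data[0].keys())
limited_data = [{k: d.get(k, None) for k in data[0].keys()} for d in data]
writer.writeheader()
writer.writerows(limited_data)  # no ValueError; "extra" is dropped
```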
30 changes: 30 additions & 0 deletions docetl/utils.py
@@ -7,6 +7,36 @@
from jinja2 import Environment, meta
from litellm import completion_cost as lcc

from lzstring import LZString

class Decryptor:
def __init__(self, secret_key: str):
self.key = secret_key
self.lz = LZString()

def decrypt(self, encrypted_data: str) -> str:
try:
# First decompress the data
compressed = self.lz.decompressFromBase64(encrypted_data)
if not compressed:
raise ValueError("Invalid compressed data")

# Then decode using the key
result = ''
for i in range(len(compressed)):
char_code = ord(compressed[i]) - ord(self.key[i % len(self.key)])
result += chr(char_code)

return result

except Exception as e:
print(f"Decryption failed: {str(e)}")
return None

def decrypt(encrypted_data: str, secret_key: str) -> str:
if not secret_key:
return encrypted_data
return Decryptor(secret_key).decrypt(encrypted_data)
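
The matching client-side encryption is not part of this diff (it lives in the web frontend); a minimal sketch of what an encryptor compatible with `Decryptor.decrypt` could look like, assuming the same shared key and LZString base64 compression:

```python
from lzstring import LZString

def encrypt(plaintext: str, secret_key: str) -> str:
    # Shift each character by the corresponding key character, then
    # LZ-compress to base64 (the inverse of Decryptor.decrypt above).
    if not secret_key:
        return plaintext
    shifted = "".join(
        chr(ord(ch) + ord(secret_key[i % len(secret_key)]))
        for i, ch in enumerate(plaintext)
    )
    return LZString().compressToBase64(shifted)

# Round trip: decrypt(encrypt("sk-...", "key"), "key") == "sk-..."
```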

class StageType(Enum):
SAMPLE_RUN = "sample_run"
3 changes: 2 additions & 1 deletion docs/concepts/operators.md
@@ -88,14 +88,15 @@ prompt: |
## Output Schema
The `output` attribute defines the structure of the LLM's response. It supports various data types:
The `output` attribute defines the structure of the LLM's response. It supports various data types (see [schemas](../concepts/schemas.md) for more details):

- `string` (or `str`, `text`, `varchar`): For text data
- `integer` (or `int`): For whole numbers
- `number` (or `float`, `decimal`): For decimal numbers
- `boolean` (or `bool`): For true/false values
- `list`: For arrays or sequences of items
- objects: Using notation `{field: type}`
- `enum`: For a set of possible values

Example:

19 changes: 19 additions & 0 deletions docs/concepts/schemas.md
@@ -22,6 +22,7 @@ Schemas are defined in the `output` section of an operator. They support various
| `integer` | `int` | For whole numbers |
| `number` | `float`, `decimal` | For decimal numbers |
| `boolean` | `bool` | For true/false values |
| `enum` | - | For a set of possible values |
| `list` | - | For arrays or sequences of items (must specify element type) |
| Objects | - | Using notation `{field: type}` |

@@ -72,6 +73,24 @@ Objects are defined using curly braces and must have typed fields:

Make sure to put the type in quotation marks if it references an object type (i.e., has curly braces)! Otherwise the YAML won't compile!

## Enum Types

You can also specify enum types, which will be validated against a set of possible values. Suppose we have an operation to extract sentiments from a document, and we want to ensure that the sentiment is one of the three possible values. Our schema would look like this:

```yaml
output:
schema:
sentiment: "enum[positive, negative, neutral]"
```

You can also specify a list of enum types (say, if we wanted to extract _multiple_ sentiments from a document):

```yaml
output:
schema:
possible_sentiments: "list[enum[positive, negative, neutral]]"
```

## Structured Outputs and Tool API

DocETL uses structured outputs or tool API to enforce schema typing. This ensures that the LLM outputs adhere to the specified schema, making the results more consistent and easier to process in subsequent operations.
@@ -12,6 +12,14 @@ Building complex LLM pipelines for your data often requires experimentation and
Expand Up @@ -12,6 +12,14 @@ Building complex LLM pipelines for your data often requires experimentation and
- ✨ Refine operations based on sample outputs
- 🔄 Build complex pipelines step-by-step

## Public Playground

You can access our hosted playground at [docetl.org/playground](https://docetl.org/playground). You'll need to provide your own LLM API keys to use the service. The chatbot and prompt engineering assistants are powered by OpenAI models, so you'll need to provide an OpenAI API key.

!!! note "Data Storage Notice"

As this is a research project, we cache results and store data on our servers to improve the system. While we will never sell or release your data, if you have privacy concerns, we recommend running the playground locally using the installation instructions below.

## Installation

There are two ways to run the playground:
@@ -24,7 +32,9 @@ The easiest way to get started is using Docker:

Create `.env` in the root directory (for the FastAPI backend):
```bash
OPENAI_API_KEY=your_api_key_here # Or your LLM provider's API key
# Required: API key for your preferred LLM provider (OpenAI, Anthropic, etc)
# The key format will depend on your chosen provider (sk-..., anthro-...)
OPENAI_API_KEY=your_api_key_here
BACKEND_ALLOW_ORIGINS=
BACKEND_HOST=localhost
BACKEND_PORT=8000
@@ -35,9 +45,11 @@ FRONTEND_PORT=3000

Create `.env.local` in the `website` directory (for the frontend) **note that this must be in the `website` directory**:
```bash
OPENAI_API_KEY=sk-xxx # For the AI assistant in the interface
OPENAI_API_BASE=https://api.openai.com/v1 # For the AI assistant in the interface
MODEL_NAME=gpt-4o-mini # For the AI assistant in the interface
# Optional: These are only needed if you want to use the AI assistant chatbot
# and prompt engineering tools. Must be OpenAI API keys specifically.
OPENAI_API_KEY=sk-xxx
OPENAI_API_BASE=https://api.openai.com/v1
MODEL_NAME=gpt-4o-mini

NEXT_PUBLIC_BACKEND_HOST=localhost
NEXT_PUBLIC_BACKEND_PORT=8000
@@ -72,7 +84,7 @@ cd docetl

2. Set up environment variables in `.env` in the root directory:
```bash
OPENAI_API_KEY=your_api_key_here
LLM_API_KEY=your_api_key_here
BACKEND_ALLOW_ORIGINS=
BACKEND_HOST=localhost
BACKEND_PORT=8000
@@ -113,6 +125,15 @@ The UI offers an optional chat-based assistant that can help you iteratively dev

To use the assistant, you need to set your OpenAI API key in the `.env.local` file in the website directory. You can get an API key [here](https://platform.openai.com/api-keys). The API key should be in the following format: `sk-proj-...`. We only support OpenAI models for the assistant.

!!! tip "Self-hosting with UI API key management"

If you want to host your own version of DocETL for your organization while allowing users to set their API keys through the UI, you'll need to set up encryption. Add the following to both `.env` and `website/.env.local`:
```bash
DOCETL_ENCRYPTION_KEY=your_secret_key_here
```
This shared encryption key allows API keys to be securely encrypted when sent to your server. Make sure to use the same value in both files.


## Complex Tutorial

See this [YouTube video](https://www.youtube.com/watch?v=IlgueVqtHGo) for a more in-depth tutorial on how to use the playground.