Merge branch 'ucbepic:main' into main
staru09 authored Dec 27, 2024
2 parents 7b703d6 + 0e077aa commit 9c0d9da
Showing 64 changed files with 5,803 additions and 2,441 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -15,6 +15,7 @@ workloads/*
*pytest_cache*
*ruff_cache*
motion-old*
venv/

# dependencies
website/node_modules
18 changes: 17 additions & 1 deletion docetl/config_wrapper.py
@@ -1,7 +1,7 @@
import datetime
import os
from docetl.console import get_console
from docetl.utils import load_config
from docetl.utils import decrypt, load_config
from typing import Any, Dict, List, Optional, Tuple, Union
from docetl.operations.utils import APIWrapper
import pyrate_limiter
@@ -71,6 +71,19 @@ def __init__(
self.console = DOCETL_CONSOLE
self.max_threads = max_threads or (os.cpu_count() or 1) * 4
self.status = None
encrypted_llm_api_keys = self.config.get("llm_api_keys", {})
if encrypted_llm_api_keys:
self.llm_api_keys = {
key: decrypt(value, os.environ.get("DOCETL_ENCRYPTION_KEY", ""))
for key, value in encrypted_llm_api_keys.items()
}
else:
self.llm_api_keys = {}

# Temporarily set environment variables for API keys
self._original_env = os.environ.copy()
for key, value in self.llm_api_keys.items():
os.environ[key] = value

buckets = {
param: pyrate_limiter.InMemoryBucket(
@@ -95,3 +108,6 @@ def __init__(
self.rate_limiter = pyrate_limiter.Limiter(bucket_factory, max_delay=math.inf)

self.api = APIWrapper(self)

def reset_env(self):
os.environ = self._original_env
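
For context, a minimal sketch of the flow this change introduces, assuming the pipeline config carries provider keys under `llm_api_keys` (the key name and encrypted value below are placeholders) and that the values were encrypted with the shared `DOCETL_ENCRYPTION_KEY`:

```python
import os

from docetl.utils import decrypt

# Placeholder config; in practice the encrypted value comes from the web frontend.
config = {"llm_api_keys": {"OPENAI_API_KEY": "<encrypted value>"}}
secret = os.environ.get("DOCETL_ENCRYPTION_KEY", "")

original_env = os.environ.copy()  # snapshot, as __init__ does
for name, value in config.get("llm_api_keys", {}).items():
    os.environ[name] = decrypt(value, secret)  # exported for the duration of the run

# ... run the pipeline ...

os.environ = original_env  # equivalent to reset_env()
```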
39 changes: 35 additions & 4 deletions docetl/operations/__init__.py
@@ -1,19 +1,50 @@
import importlib.metadata
from docetl.operations.cluster import ClusterOperation
from docetl.operations.code_operations import CodeFilterOperation, CodeMapOperation, CodeReduceOperation
from docetl.operations.equijoin import EquijoinOperation
from docetl.operations.filter import FilterOperation
from docetl.operations.gather import GatherOperation
from docetl.operations.map import MapOperation
from docetl.operations.reduce import ReduceOperation
from docetl.operations.resolve import ResolveOperation
from docetl.operations.split import SplitOperation
from docetl.operations.sample import SampleOperation
from docetl.operations.unnest import UnnestOperation


mapping = {
"cluster": ClusterOperation,
"code_filter": CodeFilterOperation,
"code_map": CodeMapOperation,
"code_reduce": CodeReduceOperation,
"equijoin": EquijoinOperation,
"filter": FilterOperation,
"gather": GatherOperation,
"map": MapOperation,
"reduce": ReduceOperation,
"resolve": ResolveOperation,
"split": SplitOperation,
"sample": SampleOperation,
"unnest": UnnestOperation,
}

def get_operation(operation_type: str):
"""Loads a single operation by name"""
try:
entrypoint = importlib.metadata.entry_points(group="docetl.operation")[
operation_type
]
except KeyError as e:
return entrypoint.load()
except KeyError:
if operation_type in mapping:
return mapping[operation_type]
raise KeyError(f"Unrecognized operation {operation_type}")
return entrypoint.load()

def get_operations():
"""Load all available operations and return them as a dictionary"""
return {
operations = mapping.copy()
operations.update({
op.name: op.load()
for op in importlib.metadata.entry_points(group="docetl.operation")
}
})
return operations
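
A quick usage sketch of the resulting lookup behavior: names resolve either from an installed `docetl.operation` entry point or from the built-in `mapping` fallback, and unknown names raise:

```python
from docetl.operations import get_operation, get_operations

MapOp = get_operation("map")   # entry point if one is installed, otherwise the built-in MapOperation
ops = get_operations()         # built-ins plus any third-party entry points
print(sorted(ops))             # ['cluster', 'code_filter', 'code_map', ...]

# get_operation("does_not_exist")  # raises KeyError("Unrecognized operation does_not_exist")
```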
2 changes: 1 addition & 1 deletion docetl/operations/utils/api.py
@@ -446,7 +446,7 @@ def _call_llm_with_cache(
dataset_description = self.runner.config.get("system_prompt", {}).get("dataset_description", "a collection of unstructured documents")
parethetical_op_instructions = "many inputs:one output" if op_type == "reduce" else "one input:one output"

system_prompt = f"You are a {persona}, intelligently transforming data. The dataset description is: {dataset_description}. You will be performing a {op_type} operation ({parethetical_op_instructions}). You will perform the specified task on the provided data, as accurately, precisely, and exhaustively as possible. The result should be a structured output that you will send back to the user."
system_prompt = f"You are a {persona}, helping the user make sense of their data. The dataset description is: {dataset_description}. You will be performing a {op_type} operation ({parethetical_op_instructions}). You will perform the specified task on the provided data, as precisely and exhaustively (i.e., high recall) as possible. The result should be a structured output that you will send back to the user, with the `send_output` function. Do not influence your answers too much based on the `send_output` function parameter names; just use them to send the result back to the user."
if scratchpad:
system_prompt += f"""
6 changes: 4 additions & 2 deletions docetl/operations/utils/llm.py
Expand Up @@ -5,7 +5,7 @@
from typing import Any, Dict, List, Optional
import tiktoken
from jinja2 import Template
from litellm import completion, RateLimitError
from litellm import model_cost
from pydantic import BaseModel
from rich import print as rprint

@@ -69,7 +69,9 @@ def truncate_messages(
from_agent: bool = False
) -> List[Dict[str, str]]:
"""Truncate messages to fit within model's context length."""
model_input_context_length = 8192 # Default
model_input_context_length = model_cost.get(model.split("/")[-1], {}).get(
"max_input_tokens", 8192
)
total_tokens = sum(count_tokens(json.dumps(msg), model) for msg in messages)

if total_tokens <= model_input_context_length - 100:
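
In effect the hard-coded 8192-token default is replaced by litellm's model metadata; a minimal sketch of the lookup (the concrete limit depends on litellm's `model_cost` table):

```python
from litellm import model_cost

def max_input_tokens(model: str, default: int = 8192) -> int:
    # Provider prefixes like "openai/" are stripped before the lookup.
    return model_cost.get(model.split("/")[-1], {}).get("max_input_tokens", default)

max_input_tokens("openai/gpt-4o-mini")   # e.g. 128000 if litellm knows the model
max_input_tokens("my-org/custom-model")  # unknown models fall back to 8192
```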
4 changes: 4 additions & 0 deletions docetl/operations/utils/validation.py
@@ -103,6 +103,10 @@ def convert_val(value: Any, model: str = "gpt-4o-mini") -> Dict[str, Any]:
if "gemini" not in model:
result["additionalProperties"] = False
return result
elif value.startswith("enum[") and value.endswith("]"):
enum_values = value[5:-1].strip().split(",")
enum_values = [v.strip() for v in enum_values]
return {"type": "string", "enum": enum_values}
else:
raise ValueError(f"Unsupported value type: {value}")

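A small sketch of what the new enum branch produces (assuming `convert_val` is imported from the module shown above):

```python
from docetl.operations.utils.validation import convert_val

convert_val("enum[positive, negative, neutral]")
# -> {"type": "string", "enum": ["positive", "negative", "neutral"]}
```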
4 changes: 3 additions & 1 deletion docetl/optimizers/map_optimizer/config_generators.py
@@ -84,7 +84,7 @@ def _get_split_config(
Determine the split key and subprompt for processing chunks of the input data.
The split key should be a key in the input data that contains a string to be split.
The subprompt should be designed to process individual chunks of the split data.
The subprompt should be designed to process individual chunks of the split data, and only process the main chunk within chunk delimiters if they are present.
Note that the subprompt's output schema might be different from the original operation's output schema, since you may want to extract more information or make the information less structured/more free text. The original output schema will be preserved when combining the chunks' processed results.
Important:
@@ -148,6 +148,8 @@ def _get_split_config(

result["subprompt_output_schema"].update(op_config["output"]["schema"])

result["subprompt"] = result["subprompt"] + " Only process the main chunk in --- Begin Main Chunk --- and --- End Main Chunk --- delimiters if they are present."

self.console.log(
f"[yellow]Breaking down operation {op_config['name']}[/yellow]"
)
2 changes: 1 addition & 1 deletion docetl/optimizers/map_optimizer/plan_generators.py
@@ -218,7 +218,7 @@ def determine_metadata_with_retry():
map_op = self.operation_creator.create_map_operation(
op_config,
subprompt_output_schema,
split_result["subprompt"] + " Only process the main chunk.",
split_result["subprompt"] ,
)

# unnest_ops = self.operation_creator.create_unnest_operations(op_config)
3 changes: 2 additions & 1 deletion docetl/runner.py
@@ -299,8 +299,9 @@ def save(self, data: List[Dict]):

with open(output_config["path"], "w", newline="") as file:
writer = csv.DictWriter(file, fieldnames=data[0].keys())
limited_data = [{k: d.get(k, None) for k in data[0].keys()} for d in data]
writer.writeheader()
writer.writerows(data)
writer.writerows(limited_data)
self.console.print(
f"[green italic]💾 Output saved to {output_config['path']}[/green italic]"
)
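
The projection guards against rows whose keys differ from the first row's: `csv.DictWriter` raises `ValueError` when a row contains fields not in `fieldnames`, so each row is limited to `data[0].keys()` and missing values fall back to `None`. A minimal sketch with made-up data:

```python
import csv
import io

data = [{"title": "a"}, {"title": "b", "extra": "would raise if written directly"}]

buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=data[0].keys())
limited_data = [{k: d.get(k, None) for k in data[0].keys()} for d in data]
writer.writeheader()
writer.writerows(limited_data)  # no ValueError; "extra" is dropped
```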
30 changes: 30 additions & 0 deletions docetl/utils.py
@@ -7,6 +7,36 @@
from jinja2 import Environment, meta
from litellm import completion_cost as lcc

from lzstring import LZString

class Decryptor:
def __init__(self, secret_key: str):
self.key = secret_key
self.lz = LZString()

def decrypt(self, encrypted_data: str) -> str:
try:
# First decompress the data
compressed = self.lz.decompressFromBase64(encrypted_data)
if not compressed:
raise ValueError("Invalid compressed data")

# Then decode using the key
result = ''
for i in range(len(compressed)):
char_code = ord(compressed[i]) - ord(self.key[i % len(self.key)])
result += chr(char_code)

return result

except Exception as e:
print(f"Decryption failed: {str(e)}")
return None

def decrypt(encrypted_data: str, secret_key: str) -> str:
if not secret_key:
return encrypted_data
return Decryptor(secret_key).decrypt(encrypted_data)
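
The matching client-side encryption is not part of this diff (it lives in the web frontend); a minimal sketch of what an encryptor compatible with `Decryptor.decrypt` could look like, assuming the same shared key and LZString base64 compression:

```python
from lzstring import LZString

def encrypt(plaintext: str, secret_key: str) -> str:
    # Shift each character by the corresponding key character, then
    # LZ-compress to base64 (the inverse of Decryptor.decrypt above).
    if not secret_key:
        return plaintext
    shifted = "".join(
        chr(ord(ch) + ord(secret_key[i % len(secret_key)]))
        for i, ch in enumerate(plaintext)
    )
    return LZString().compressToBase64(shifted)

# Round trip: decrypt(encrypt("sk-...", "key"), "key") == "sk-..."
```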

class StageType(Enum):
SAMPLE_RUN = "sample_run"
3 changes: 2 additions & 1 deletion docs/concepts/operators.md
@@ -88,14 +88,15 @@ prompt: |
## Output Schema
The `output` attribute defines the structure of the LLM's response. It supports various data types:
The `output` attribute defines the structure of the LLM's response. It supports various data types (see [schemas](../concepts/schemas.md) for more details):

- `string` (or `str`, `text`, `varchar`): For text data
- `integer` (or `int`): For whole numbers
- `number` (or `float`, `decimal`): For decimal numbers
- `boolean` (or `bool`): For true/false values
- `list`: For arrays or sequences of items
- objects: Using notation `{field: type}`
- `enum`: For a set of possible values

Example:

19 changes: 19 additions & 0 deletions docs/concepts/schemas.md
@@ -22,6 +22,7 @@ Schemas are defined in the `output` section of an operator. They support various
| `integer` | `int` | For whole numbers |
| `number` | `float`, `decimal` | For decimal numbers |
| `boolean` | `bool` | For true/false values |
| `enum` | - | For a set of possible values |
| `list` | - | For arrays or sequences of items (must specify element type) |
| Objects | - | Using notation `{field: type}` |

@@ -72,6 +73,24 @@ Objects are defined using curly braces and must have typed fields:

Make sure to put the type in quotation marks if it references an object type (i.e., has curly braces)! Otherwise the YAML won't compile!

## Enum Types

You can also specify enum types, which will be validated against a set of possible values. Suppose we have an operation to extract sentiments from a document, and we want to ensure that the sentiment is one of the three possible values. Our schema would look like this:

```yaml
output:
schema:
sentiment: "enum[positive, negative, neutral]"
```

You can also specify a list of enum types (say, if we wanted to extract _multiple_ sentiments from a document):

```yaml
output:
schema:
possible_sentiments: "list[enum[positive, negative, neutral]]"
```

## Structured Outputs and Tool API

DocETL uses structured outputs or tool API to enforce schema typing. This ensures that the LLM outputs adhere to the specified schema, making the results more consistent and easier to process in subsequent operations.
@@ -12,6 +12,14 @@ Building complex LLM pipelines for your data often requires experimentation and
Expand Up @@ -12,6 +12,14 @@ Building complex LLM pipelines for your data often requires experimentation and
- ✨ Refine operations based on sample outputs
- 🔄 Build complex pipelines step-by-step

## Public Playground

You can access our hosted playground at [docetl.org/playground](https://docetl.org/playground). You'll need to provide your own LLM API keys to use the service. The chatbot and prompt engineering assistants are powered by OpenAI models, so you'll need to provide an OpenAI API key.

!!! note "Data Storage Notice"

As this is a research project, we cache results and store data on our servers to improve the system. While we will never sell or release your data, if you have privacy concerns, we recommend running the playground locally using the installation instructions below.

## Installation

There are two ways to run the playground:
@@ -24,7 +32,9 @@ The easiest way to get started is using Docker:

Create `.env` in the root directory (for the FastAPI backend):
```bash
OPENAI_API_KEY=your_api_key_here # Or your LLM provider's API key
# Required: API key for your preferred LLM provider (OpenAI, Anthropic, etc)
# The key format will depend on your chosen provider (sk-..., anthro-...)
OPENAI_API_KEY=your_api_key_here
BACKEND_ALLOW_ORIGINS=
BACKEND_HOST=localhost
BACKEND_PORT=8000
@@ -35,9 +45,11 @@ FRONTEND_PORT=3000

Create `.env.local` in the `website` directory (for the frontend) **note that this must be in the `website` directory**:
```bash
OPENAI_API_KEY=sk-xxx # For the AI assistant in the interface
OPENAI_API_BASE=https://api.openai.com/v1 # For the AI assistant in the interface
MODEL_NAME=gpt-4o-mini # For the AI assistant in the interface
# Optional: These are only needed if you want to use the AI assistant chatbot
# and prompt engineering tools. Must be OpenAI API keys specifically.
OPENAI_API_KEY=sk-xxx
OPENAI_API_BASE=https://api.openai.com/v1
MODEL_NAME=gpt-4o-mini

NEXT_PUBLIC_BACKEND_HOST=localhost
NEXT_PUBLIC_BACKEND_PORT=8000
@@ -72,7 +84,7 @@ cd docetl

2. Set up environment variables in `.env` in the root directory:
```bash
OPENAI_API_KEY=your_api_key_here
LLM_API_KEY=your_api_key_here
BACKEND_ALLOW_ORIGINS=
BACKEND_HOST=localhost
BACKEND_PORT=8000
@@ -113,6 +125,15 @@ The UI offers an optional chat-based assistant that can help you iteratively dev

To use the assistant, you need to set your OpenAI API key in the `.env.local` file in the website directory. You can get an API key [here](https://platform.openai.com/api-keys). The API key should be in the following format: `sk-proj-...`. We only support OpenAI models for the assistant.

!!! tip "Self-hosting with UI API key management"

If you want to host your own version of DocETL for your organization while allowing users to set their API keys through the UI, you'll need to set up encryption. Add the following to both `.env` and `website/.env.local`:
```bash
DOCETL_ENCRYPTION_KEY=your_secret_key_here
```
This shared encryption key allows API keys to be securely encrypted when sent to your server. Make sure to use the same value in both files.


## Complex Tutorial

See this [YouTube video](https://www.youtube.com/watch?v=IlgueVqtHGo) for a more in-depth tutorial on how to use the playground.