contrib: OpenAI code generation (#675)
* code generation chain added

* reviews incorporated

* dag.png updated

---------

Co-authored-by: zilto <tjean@DESKTOP-V6JDCS2>
zilto and zilto authored Feb 2, 2024
1 parent 232176a commit da7c676
Showing 6 changed files with 170 additions and 0 deletions.
37 changes: 37 additions & 0 deletions contrib/hamilton/contrib/user/zilto/llm_generate_code/README.md
@@ -0,0 +1,37 @@
# Purpose of this module

This module uses the OpenAI completions API to generate code.

For any language, you can request `parsed_generated_code` to get the generated code. If you are generating Python code, you can execute it in a subprocess by requesting `execution_output` and `execution_error`.

## Example
```python
from hamilton import driver
import __init__ as llm_generate_code

dr = driver.Builder().with_modules(llm_generate_code).build()

dr.execute(
    ["execution_output", "execution_error"],
    inputs=dict(
        query="Retrieve the primary type from a `typing.Annotated` object",
    ),
)
```

## Configuration Options
### Config.when
This module doesn't receive configurations.

### Inputs
- `query`: The query for which you want code generated.
- `api_key`: The OpenAI API key to use. If None, the environment variable `OPENAI_API_KEY` is read.
- `code_language`: The code language to generate the response in. Defaults to `python`.

### Overrides
- `prompt_template_to_generate_code`: Create a new prompt template with the fields `query` and `code_language`, as sketched below.
- `prompt_to_generate_code`: Manually provide a full prompt to generate code.
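
For example, here is a minimal sketch of overriding the template at execution time (the template text and query below are illustrative, not part of the module; any replacement template must keep the `{query}` and `{code_language}` placeholders):

```python
from hamilton import driver
import __init__ as llm_generate_code

dr = driver.Builder().with_modules(llm_generate_code).build()

# Illustrative replacement template; it must keep both placeholders.
custom_template = (
    "Solve the user's problem in {code_language}.\n"
    "Return only code in a fenced Markdown block.\n"
    "user problem:\n"
    "{query}\n"
)

result = dr.execute(
    ["parsed_generated_code"],
    inputs=dict(query="Sort a list of tuples by their second element."),
    overrides=dict(prompt_template_to_generate_code=custom_template),
)
print(result["parsed_generated_code"])
```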

## Extension / Limitations
- Executing arbitrary generated code is a security risk. Proceed with caution.
- You need to manually install dependencies for your generated code to be executed (e.g., you need to `pip install pandas` yourself).
127 changes: 127 additions & 0 deletions contrib/hamilton/contrib/user/zilto/llm_generate_code/__init__.py
@@ -0,0 +1,127 @@
import logging
import os
import subprocess
from typing import Optional

from hamilton.function_modifiers import extract_fields

logger = logging.getLogger(__name__)

from hamilton import contrib

with contrib.catch_import_errors(__name__, __file__, logger):
    import openai


def llm_client(api_key: Optional[str] = None) -> openai.OpenAI:
    """Create an OpenAI client."""
    if api_key is None:
        api_key = os.environ.get("OPENAI_API_KEY")

    return openai.OpenAI(api_key=api_key)


def prompt_template_to_generate_code() -> str:
    """Prompt template to generate code.

    It must include the fields `code_language` and `query`.
    """
    return """Write some {code_language} code to solve the user's problem.
Return only {code_language} code in Markdown format, e.g.:
```{code_language}
....
```
user problem
{query}
{code_language} code
"""


def prompt_to_generate_code(
    prompt_template_to_generate_code: str, query: str, code_language: str = "python"
) -> str:
    """Fill the prompt template with the code language and the user query."""
    return prompt_template_to_generate_code.format(
        query=query,
        code_language=code_language,
    )
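

# For example, with query="Reverse a string" and the defaults, the filled prompt
# is roughly:
#
#   Write some python code to solve the user's problem.
#   Return only python code in Markdown format, e.g.:
#   ```python
#   ....
#   ```
#   user problem
#   Reverse a string
#   python code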


def response_generated_code(llm_client: openai.OpenAI, prompt_to_generate_code: str) -> str:
    """Call the OpenAI API completions endpoint with the prompt to generate code."""
    response = llm_client.completions.create(
        model="gpt-3.5-turbo-instruct",
        prompt=prompt_to_generate_code,
    )
    return response.choices[0].text


def parsed_generated_code(response_generated_code: str, code_language: str = "python") -> str:
    """Retrieve the code section from the generated text."""
    _, _, lower_part = response_generated_code.partition(f"```{code_language}")
    code_part, _, _ = lower_part.partition("```")
    return code_part
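

# For example (hypothetical completion text), only the fenced body is kept:
#   parsed_generated_code("Sure!\n```python\nx = 1\n```\nDone.", "python")
#   returns "\nx = 1\n"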


def code_prepared_for_execution(parsed_generated_code: str, code_language: str = "python") -> str:
    """If the code is Python, append statements that prepare it to be run in a subprocess.

    We collect all local variables in a dictionary and filter out Python builtins to keep
    only the variables from the generated code. print() is used to send string data from
    the subprocess back to the parent process via its `stdout`.
    """
    if code_language != "python":
        raise ValueError("Can only execute the generated code if `code_language` = 'python'")

    code_to_get_vars = (
        "excluded_vars = {'excluded_vars', '__builtins__', '__annotations__'} | set(dir(__builtins__))\n"
        "local_vars = {k: v for k, v in locals().items() if k not in excluded_vars}\n"
        "print(local_vars)"
    )

    # Join with a newline in case the parsed code lacks a trailing one.
    return parsed_generated_code + "\n" + code_to_get_vars
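

# For parsed code "s = 'hello world'", the prepared script is roughly:
#   s = 'hello world'
#   excluded_vars = {'excluded_vars', '__builtins__', '__annotations__'} | set(dir(__builtins__))
#   local_vars = {k: v for k, v in locals().items() if k not in excluded_vars}
#   print(local_vars)  # the subprocess prints {'s': 'hello world'}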


@extract_fields(
    dict(
        execution_output=str,
        execution_error=str,
    )
)
def executed_output(code_prepared_for_execution: str) -> dict:
    """Execute the generated Python code + appended utilities in a subprocess.

    The output and errors from the code are collected as strings. Executing
    the code in a subprocess provides isolation, but isn't a security guarantee.
    """
    process = subprocess.Popen(
        ["python", "-c", code_prepared_for_execution],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    output, errors = process.communicate()
    return dict(execution_output=output, execution_error=errors)
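

# Note: `execution_output` holds whatever the subprocess printed to stdout, here the
# repr of its local variables. If you need it back as a dict, ast.literal_eval is one
# option (an assumption on the caller's side, not something this module does).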


# run as a script to test dataflow
if __name__ == "__main__":
    import __init__ as llm_generate_code

    from hamilton import driver

    dr = driver.Builder().with_modules(llm_generate_code).build()

    dr.display_all_functions("dag.png", orient="TB")

    res = dr.execute(
        ["execution_output", "execution_error"],
        overrides=dict(parsed_generated_code="s = 'hello world'"),
    )

    print(res)
dag.png: binary image of the dataflow (cannot be displayed in the diff view).
@@ -0,0 +1 @@
openai
@@ -0,0 +1,5 @@
{
"schema": "1.0",
"use_case_tags": ["LLM", "OpenAI", "code generation"],
"secondary_tags": {}
}
Empty file.
