Commit 139cc19

Merge branch 'develop' into feature/revised-programming-feedback
# Conflicts:
#	llm_core/llm_core/utils/llm_utils.py
#	modules/modeling/module_modeling_llm/poetry.lock
#	modules/modeling/module_modeling_llm/pyproject.toml
#	modules/programming/module_programming_llm/module_programming_llm/config.py
#	modules/programming/module_programming_llm/module_programming_llm/generate_graded_suggestions_by_file.py
#	modules/programming/module_programming_llm/module_programming_llm/generate_non_graded_suggestions_by_file.py
#	modules/programming/module_programming_llm/module_programming_llm/generate_summary_by_file.py
#	modules/programming/module_programming_llm/module_programming_llm/helpers/models/model_config.py
#	modules/programming/module_programming_llm/module_programming_llm/helpers/models/openai.py
#	modules/programming/module_programming_llm/module_programming_llm/helpers/models/replicate.py
#	modules/programming/module_programming_llm/module_programming_llm/split_grading_instructions_by_file.py
#	modules/programming/module_programming_llm/module_programming_llm/split_problem_statement_by_file.py
#	modules/programming/module_programming_llm/poetry.lock
#	modules/programming/module_programming_llm/pyproject.toml
#	modules/text/module_text_llm/poetry.lock
#	modules/text/module_text_llm/pyproject.toml
dmytropolityka committed Oct 29, 2024
2 parents 132214c + 9c49cc8 commit 139cc19
Showing 45 changed files with 4,177 additions and 2,262 deletions.
7 changes: 7 additions & 0 deletions .github/workflows/build.yml
@@ -56,6 +56,13 @@ jobs:
docker build -t athena .
cd ..
- name: Build llm_core image
id: set-image-core_llm
run: |
cd ./llm_core
docker build -t llm_core .
cd ..
- name: Docker Login
id: docker-login
run: |
@@ -33,6 +33,7 @@ async def find_module_by_name(module_name: str) -> Optional[Module]:
return None


# pylint: disable=too-many-positional-arguments
async def request_to_module(module: Module, headers: dict, path: str, lms_url: str, data: Optional[dict], method: str) -> ModuleResponse:
"""
Helper function to send a request to a module.
211 changes: 41 additions & 170 deletions assessment_module_manager/poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions athena-docker.sh
@@ -40,7 +40,7 @@ function download_cofee_config {
mkdir -p ./module_text_cofee
for file in traefik.docker.yml node_config.docker.yml; do
echo " Downloading $file..."
curl -sSL -o ./module_text_cofee/$file https://raw.githubusercontent.com/ls1intum/Athena/"$pr_branch"/module_text_cofee/"$file"
curl -sSL -o ./module_text_cofee/$file https://raw.githubusercontent.com/ls1intum/Athena/"$pr_branch"/modules/text/module_text_cofee/"$file"
done
}

@@ -146,4 +146,4 @@ case "$subcommand" in
general_help
exit 1
;;
esac
esac
10 changes: 10 additions & 0 deletions docker-compose.yml
@@ -8,6 +8,13 @@ services:
image: athena
command: echo "Athena build succeeded, exiting (this is normal)"

llm_core:
build: ./llm_core
depends_on:
- athena
image: llm_core
command: echo "llm_core build succeeded, exiting (this is normal)"

assessment_module_manager:
build: ./assessment_module_manager
depends_on:
@@ -30,6 +37,7 @@ services:
build: modules/programming/module_programming_llm
depends_on:
- athena
- llm_core
ports:
- "5002:5002"

@@ -38,6 +46,7 @@ services:
build: modules/text/module_text_llm
depends_on:
- athena
- llm_core
ports:
- "5003:5003"

@@ -70,5 +79,6 @@ services:
build: modules/modeling/module_modeling_llm
depends_on:
- athena
- llm_core
ports:
- "5008:5008"
41 changes: 41 additions & 0 deletions llm_core/.env.example
@@ -0,0 +1,41 @@
# Comment out the variables that you define somewhere else
# Environment variables are overwritten by .env file

PRODUCTION=0
SECRET=12345abcdef
DATABASE_URL=sqlite:///../data/data.sqlite


################################################################
# LLM Credentials #
################################################################

# Default model to use
# See below for options, available models are also logged on startup
LLM_DEFAULT_MODEL="azure_openai_gpt-4o"
LLM_DEFAULT_MODEL_COST_PER_MILLION_INPUT_TOKEN=5
LLM_DEFAULT_MODEL_COST_PER_MILLION_OUTPUT_TOKEN=15

# Enable LLM-as-a-judge approach 0 = disabled, 1 = enabled
LLM_ENABLE_LLM_AS_A_JUDGE=1
# Evaluation model to use for the LLM-as-a-judge approach [Only important if you want to use it in the /evaluate endpoint]
# See below for options, available models are also logged on startup
LLM_EVALUATION_MODEL="azure_openai_gpt-4o"

# Standard OpenAI (Non-Azure) [leave blank if not used]
# Model names prefixed with `openai_` followed by the model name, e.g. `openai_text-davinci-003`
# A list of models can be found in `module_text_llm/helpers/models/openai.py` (openai_models)
OPENAI_API_KEY="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

# Azure OpenAI [leave blank if not used]
# Model names prefixed with `azure_openai_` followed by the deployment id, e.g. `azure_openai_gpt-35`
AZURE_OPENAI_API_KEY="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
AZURE_OPENAI_ENDPOINT="https://ase-eu01.openai.azure.com/" # change base if needed
OPENAI_API_VERSION="2024-06-01" # change base if needed

# LangSmith (can be used for tracing LLMs) [leave blank if not used]
# See https://docs.smith.langchain.com
# LANGCHAIN_TRACING_V2=true
# LANGCHAIN_ENDPOINT="https://api.smith.langchain.com"
# LANGCHAIN_API_KEY="XXX"
# LANGCHAIN_PROJECT="XXX"
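
As a quick illustration (not part of this commit), here is a minimal sketch of how these settings might be read once the llm_core package has loaded the .env file; the variable names come from the example above, everything else is assumed:

import os

import llm_core  # importing the package runs dotenv.load_dotenv(override=True), see llm_core/__init__.py below

# Read the default model and its pricing; the fallbacks here are placeholders
default_model = os.environ.get("LLM_DEFAULT_MODEL", "azure_openai_gpt-4o")
input_cost = float(os.environ.get("LLM_DEFAULT_MODEL_COST_PER_MILLION_INPUT_TOKEN", 0.0))
output_cost = float(os.environ.get("LLM_DEFAULT_MODEL_COST_PER_MILLION_OUTPUT_TOKEN", 0.0))
use_llm_as_a_judge = os.environ.get("LLM_ENABLE_LLM_AS_A_JUDGE", "0") == "1"

print(default_model, input_cost, output_cost, use_llm_as_a_judge)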
24 changes: 24 additions & 0 deletions llm_core/Dockerfile
@@ -0,0 +1,24 @@
# syntax=docker/dockerfile:1

# This is the Dockerfile for the shared llm package.
# Its output is used as a dependency in the module_* Dockerfiles.

FROM python:3.11 as llm_core

WORKDIR /code

# Poetry
RUN pip install --no-cache-dir poetry==1.5.0

# Dependencies
COPY pyproject.toml poetry.lock ./
COPY --from=athena /code /athena

RUN poetry config virtualenvs.create false \
&& poetry install --no-interaction --no-ansi

# Project files
COPY . ./

# Build the package
RUN poetry build -f wheel
4 changes: 4 additions & 0 deletions llm_core/llm_core/__init__.py
@@ -0,0 +1,4 @@
import dotenv

# Load environment variables from .env file (for local development)
dotenv.load_dotenv(override=True)
@@ -2,8 +2,7 @@
from typing import Type, Union, List, Optional
from langchain.base_language import BaseLanguageModel

from module_modeling_llm.models.model_config import ModelConfig

from llm_core.models.model_config import ModelConfig


DefaultModelConfig: Type[ModelConfig]
@@ -15,7 +14,7 @@

types: List[Type[ModelConfig]] = []
try:
import module_modeling_llm.models.openai as openai_config
import llm_core.models.openai as openai_config
types.append(openai_config.OpenAIModelConfig)
if default_model_name in openai_config.available_models:
DefaultModelConfig = openai_config.OpenAIModelConfig
@@ -36,4 +35,4 @@
ModelConfigType = type0
else:
type1 = types[1]
ModelConfigType = Union[type0, type1] # type: ignore
ModelConfigType = Union[type0, type1] # type: ignore
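
For orientation, a hedged sketch of how a module might consume these dynamically assembled types; the export location (llm_core.models) and the field usage are assumptions and are not shown in this diff:

from pydantic import BaseModel, Field

# Assumed import location for the names built above
from llm_core.models import DefaultModelConfig, ModelConfigType


class ApproachConfig(BaseModel):
    """Hypothetical module configuration carrying an LLM model config."""
    model: ModelConfigType = Field(default_factory=DefaultModelConfig)  # type: ignore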
43 changes: 43 additions & 0 deletions llm_core/llm_core/models/callbacks.py
@@ -0,0 +1,43 @@
import os

from langchain.callbacks.base import BaseCallbackHandler
from langchain_core.outputs import LLMResult
from langchain_core.messages.ai import UsageMetadata

from athena import emit_meta, get_meta


class UsageHandler(BaseCallbackHandler):
def on_llm_end(self, response: LLMResult, **kwargs) -> None:
cost_per_million_input_tokens = float(os.environ.get("LLM_DEFAULT_MODEL_COST_PER_MILLION_INPUT_TOKEN", 0.0))
cost_per_million_output_tokens = float(os.environ.get("LLM_DEFAULT_MODEL_COST_PER_MILLION_OUTPUT_TOKEN", 0.0))

meta = get_meta()

total_usage = meta.get("totalUsage", {"numInputTokens": 0, "numOutputTokens": 0, "numTotalTokens": 0, "cost": 0 })
llm_calls = meta.get("llmRequests", [])

for generations in response.generations:
for generation in generations:
message = generation.dict()["message"]
generation_usage: UsageMetadata = message["usage_metadata"]
model_name = message["response_metadata"].get("model_name", None)

total_usage["numInputTokens"] += generation_usage["input_tokens"]
total_usage["numOutputTokens"] += generation_usage["output_tokens"]
total_usage["numTotalTokens"] += generation_usage["total_tokens"]

total_usage["cost"] += int(generation_usage["input_tokens"]) * cost_per_million_input_tokens / 1_000_000
total_usage["cost"] += int(generation_usage["output_tokens"]) * cost_per_million_output_tokens / 1_000_000

llm_calls.append({
"model": model_name,
"costPerMillionInputToken": cost_per_million_input_tokens,
"costPerMillionOutputToken": cost_per_million_output_tokens,
"numInputTokens": generation_usage["input_tokens"],
"numOutputTokens": generation_usage["output_tokens"],
"numTotalTokens": generation_usage["total_tokens"],
})

emit_meta("totalUsage", total_usage)
emit_meta("llmRequests", llm_calls)
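
A hedged usage sketch (not from this commit): attaching the handler to a LangChain chat model so that on_llm_end aggregates token usage and cost into the module's metadata. The model class and prompt are placeholders, and the emit_meta/get_meta calls assume this runs inside an Athena module request:

from langchain_openai import ChatOpenAI  # any LangChain chat model should work here

from llm_core.models.callbacks import UsageHandler


async def summarize(text: str) -> str:
    # Attach the handler so on_llm_end adds "totalUsage" and "llmRequests" to the meta
    model = ChatOpenAI(model="gpt-4o", callbacks=[UsageHandler()])
    response = await model.ainvoke(f"Summarize the submission in one sentence:\n{text}")
    return response.content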
File renamed without changes.
File renamed without changes.
@@ -1,20 +1,15 @@
from typing import Optional, Type, TypeVar, List
from pydantic import BaseModel, ValidationError
from typing import Type, TypeVar, List
from pydantic import BaseModel
import tiktoken

from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.base_language import BaseLanguageModel
from langchain.prompts import (
ChatPromptTemplate,
SystemMessagePromptTemplate,
HumanMessagePromptTemplate,
)
from langchain.chains.openai_functions import create_structured_output_chain
from langchain.output_parsers import PydanticOutputParser
from langchain.schema import OutputParserException

from athena import emit_meta, get_experiment_environment
from athena import emit_meta

T = TypeVar("T", bound=BaseModel)

@@ -31,9 +26,9 @@ def num_tokens_from_prompt(chat_prompt: ChatPromptTemplate, prompt_input: dict)
return num_tokens_from_string(chat_prompt.format(**prompt_input))


def check_prompt_length_and_omit_features_if_necessary(prompt: ChatPromptTemplate,
prompt_input: dict,
max_input_tokens: int,
def check_prompt_length_and_omit_features_if_necessary(prompt: ChatPromptTemplate,
prompt_input: dict,
max_input_tokens: int,
omittable_features: List[str],
debug: bool):
"""Check if the input is too long and omit features if necessary.
@@ -48,7 +43,7 @@ def check_prompt_length_and_omit_features_if_necessary(prompt: ChatPromptTemplat
debug (bool): Debug flag
Returns:
(dict, bool): Tuple of (prompt_input, should_run) where prompt_input is the input with omitted features and
(dict, bool): Tuple of (prompt_input, should_run) where prompt_input is the input with omitted features and
should_run is True if the model should run, False otherwise
"""
if num_tokens_from_prompt(prompt, prompt_input) <= max_input_tokens:
@@ -84,11 +79,11 @@ def supports_function_calling(model: BaseLanguageModel):


def get_chat_prompt_with_formatting_instructions(
model: BaseLanguageModel,
system_message: str,
human_message: str,
pydantic_object: Type[T]
) -> ChatPromptTemplate:
model: BaseLanguageModel,
system_message: str,
human_message: str,
pydantic_object: Type[T]
) -> ChatPromptTemplate:
"""Returns a ChatPromptTemplate with formatting instructions (if necessary)
Note: Does nothing if the model supports function calling
@@ -106,57 +101,10 @@ def get_chat_prompt_with_formatting_instructions(
system_message_prompt = SystemMessagePromptTemplate.from_template(system_message)
human_message_prompt = HumanMessagePromptTemplate.from_template(human_message)
return ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])

output_parser = PydanticOutputParser(pydantic_object=pydantic_object)
system_message_prompt = SystemMessagePromptTemplate.from_template(system_message + "\n{format_instructions}")
system_message_prompt.prompt.partial_variables = {"format_instructions": output_parser.get_format_instructions()}
system_message_prompt.prompt.input_variables.remove("format_instructions")
human_message_prompt = HumanMessagePromptTemplate.from_template(human_message + "\n\nJSON response following the provided schema:")
return ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])


async def predict_and_parse(
model: BaseLanguageModel,
chat_prompt: ChatPromptTemplate,
prompt_input: dict,
pydantic_object: Type[T],
tags: Optional[List[str]]
) -> Optional[T]:
"""Predicts an LLM completion using the model and parses the output using the provided Pydantic model
Args:
model (BaseLanguageModel): The model to predict with
chat_prompt (ChatPromptTemplate): Prompt to use
prompt_input (dict): Input parameters to use for the prompt
pydantic_object (Type[T]): Pydantic model to parse the output
tags (Optional[List[str]]: List of tags to tag the prediction with
Returns:
Optional[T]: Parsed output, or None if it could not be parsed
"""
experiment = get_experiment_environment()

tags = tags or []
if experiment.experiment_id is not None:
tags.append(f"experiment-{experiment.experiment_id}")
if experiment.module_configuration_id is not None:
tags.append(f"module-configuration-{experiment.module_configuration_id}")
if experiment.run_id is not None:
tags.append(f"run-{experiment.run_id}")

if supports_function_calling(model):
chain = create_structured_output_chain(pydantic_object, llm=model, prompt=chat_prompt, tags=tags)

try:
return await chain.arun(**prompt_input)
except (OutputParserException, ValidationError):
# In the future, we should probably have some recovery mechanism here (i.e. fix the output with another prompt)
return None

output_parser = PydanticOutputParser(pydantic_object=pydantic_object)
chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=output_parser, tags=tags)
try:
return await chain.arun(**prompt_input)
except (OutputParserException, ValidationError):
# In the future, we should probably have some recovery mechanism here (i.e. fix the output with another prompt)
return None
return ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
@@ -8,12 +8,24 @@
T = TypeVar("T", bound=BaseModel)

async def predict_and_parse(
model: BaseLanguageModel,
chat_prompt: ChatPromptTemplate,
prompt_input: dict,
pydantic_object: Type[T],
model: BaseLanguageModel,
chat_prompt: ChatPromptTemplate,
prompt_input: dict,
pydantic_object: Type[T],
tags: Optional[List[str]]
) -> Optional[T]:
) -> Optional[T]:
"""Predicts an LLM completion using the model and parses the output using the provided Pydantic model
Args:
model (BaseLanguageModel): The model to predict with
chat_prompt (ChatPromptTemplate): Prompt to use
prompt_input (dict): Input parameters to use for the prompt
pydantic_object (Type[T]): Pydantic model to parse the output
tags (Optional[List[str]]): List of tags to tag the prediction with
Returns:
Optional[T]: Parsed output, or None if it could not be parsed
"""
experiment = get_experiment_environment()

tags = tags or []
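
To round out the docstring above, a hedged example of calling predict_and_parse from a module; the import path, the model, and the Pydantic schema are assumptions for illustration only:

from typing import List, Optional

from pydantic import BaseModel
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI  # placeholder model

# Assumed module path; the diff does not show the file name of this helper
from llm_core.utils.predict_and_parse import predict_and_parse


class FeedbackItem(BaseModel):
    title: str
    description: str
    credits: float


class Assessment(BaseModel):
    feedbacks: List[FeedbackItem]


async def assess(submission_text: str) -> Optional[Assessment]:
    chat_prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a strict but helpful grading assistant."),
        ("human", "Give feedback on this submission:\n{submission}"),
    ])
    # Returns a parsed Assessment, or None if the completion could not be parsed
    return await predict_and_parse(
        model=ChatOpenAI(model="gpt-4o"),
        chat_prompt=chat_prompt,
        prompt_input={"submission": submission_text},
        pydantic_object=Assessment,
        tags=["example"],
    )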