Multiple Selectable Approaches for Text (Chain of Thought with Prompt Chaining) #350

Merged 9 commits on Nov 29, 2024
Changes from all commits
3 changes: 3 additions & 0 deletions llm_core/llm_core/models/__init__.py
@@ -6,6 +6,7 @@


DefaultModelConfig: Type[ModelConfig]
MiniModelConfig: ModelConfig
default_model_name = os.environ.get("LLM_DEFAULT_MODEL")
evaluation_model_name = os.environ.get("LLM_EVALUATION_MODEL")

@@ -18,6 +19,8 @@
types.append(openai_config.OpenAIModelConfig)
if default_model_name in openai_config.available_models:
DefaultModelConfig = openai_config.OpenAIModelConfig
if "openai_gpt-4o-mini" in openai_config.available_models:
MiniModelConfig = openai_config.OpenAIModelConfig(model_name="openai_gpt-4o-mini", max_tokens=3000, temperature=0, top_p=0.9, presence_penalty=0, frequency_penalty=0)
if evaluation_model_name in openai_config.available_models:
evaluation_model = openai_config.available_models[evaluation_model_name]
except AttributeError:
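For orientation, here is a minimal sketch of how the new MiniModelConfig instance could be consumed downstream, assuming the ModelConfig interface exposes get_model() the way generate_suggestions uses it further down in this PR; the prompt text is purely illustrative.

from llm_core.models import MiniModelConfig

# Resolve a LangChain chat model from the pre-built mini config
# (OpenAI credentials are expected to come from the environment).
model = MiniModelConfig.get_model()  # type: ignore[attr-defined]

# Any LangChain runnable call works from here on.
reply = model.invoke("Give one sentence of feedback on: 'The answer is 42.'")
print(reply.content)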
4 changes: 2 additions & 2 deletions llm_core/llm_core/utils/llm_utils.py
@@ -1,7 +1,7 @@
from typing import Type, TypeVar, List
from pydantic import BaseModel
import tiktoken
from langchain.chat_models import ChatOpenAI
from langchain_openai import AzureChatOpenAI, ChatOpenAI
from langchain.base_language import BaseLanguageModel
from langchain.prompts import (
ChatPromptTemplate,
@@ -75,7 +75,7 @@ def supports_function_calling(model: BaseLanguageModel):
Returns:
boolean: True if the model supports function calling, False otherwise
"""
return isinstance(model, ChatOpenAI)
return isinstance(model, ChatOpenAI) or isinstance(model, AzureChatOpenAI)


def get_chat_prompt_with_formatting_instructions(
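Returning to supports_function_calling above: a small sketch of what the widened check buys. The deployment name, endpoint, and keys below are placeholders, not real configuration.

from langchain_openai import AzureChatOpenAI, ChatOpenAI
from llm_core.utils.llm_utils import supports_function_calling

# Both OpenAI-backed chat model classes now count as function-calling capable.
openai_model = ChatOpenAI(model="gpt-4o-mini", api_key="sk-placeholder")
azure_model = AzureChatOpenAI(
    azure_deployment="gpt-4o-mini",
    api_version="2024-02-01",
    azure_endpoint="https://example.openai.azure.com",
    api_key="placeholder",
)
assert supports_function_calling(openai_model)
assert supports_function_calling(azure_model)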
40 changes: 29 additions & 11 deletions llm_core/llm_core/utils/predict_and_parse.py
@@ -12,7 +12,8 @@ async def predict_and_parse(
chat_prompt: ChatPromptTemplate,
prompt_input: dict,
pydantic_object: Type[T],
tags: Optional[List[str]]
tags: Optional[List[str]],
use_function_calling: bool = False
) -> Optional[T]:
"""Predicts an LLM completion using the model and parses the output using the provided Pydantic model

@@ -36,13 +37,30 @@
if experiment.run_id is not None:
tags.append(f"run-{experiment.run_id}")

structured_output_llm = model.with_structured_output(pydantic_object, method="json_mode")
chain = RunnableSequence(
chat_prompt,
structured_output_llm
)

try:
return await chain.ainvoke(prompt_input, config={"tags": tags})
except ValidationError as e:
raise ValueError(f"Could not parse output: {e}") from e

if use_function_calling:
structured_output_llm = model.with_structured_output(pydantic_object)
chain = chat_prompt | structured_output_llm

try:
result = await chain.ainvoke(prompt_input, config={"tags": tags})

if isinstance(result, pydantic_object):
return result
else:
raise ValueError("Parsed output does not match the expected Pydantic model.")

except ValidationError as e:
raise ValueError(f"Could not parse output: {e}") from e

else:
structured_output_llm = model.with_structured_output(pydantic_object, method="json_mode")
chain = RunnableSequence(
chat_prompt,
structured_output_llm
)
try:
return await chain.ainvoke(prompt_input, config={"tags": tags})
except ValidationError as e:
raise ValueError(f"Could not parse output: {e}") from e

6 changes: 1 addition & 5 deletions modules/text/module_text_llm/module_text_llm/__main__.py
@@ -1,19 +1,16 @@
import json
import os
from typing import List, Any

import nltk
import tiktoken

from athena import app, submission_selector, submissions_consumer, feedback_consumer, feedback_provider, evaluation_provider
from athena.text import Exercise, Submission, Feedback
from athena.logger import logger

from module_text_llm.config import Configuration
from module_text_llm.evaluation import get_feedback_statistics, get_llm_statistics
from module_text_llm.generate_suggestions import generate_suggestions
from module_text_llm.generate_evaluation import generate_evaluation

from module_text_llm.approach_controller import generate_suggestions

@submissions_consumer
def receive_submissions(exercise: Exercise, submissions: List[Submission]):
@@ -30,7 +27,6 @@ def select_submission(exercise: Exercise, submissions: List[Submission]) -> Subm
def process_incoming_feedback(exercise: Exercise, submission: Submission, feedbacks: List[Feedback]):
logger.info("process_feedback: Received %d feedbacks for submission %d of exercise %d.", len(feedbacks), submission.id, exercise.id)


@feedback_provider
async def suggest_feedback(exercise: Exercise, submission: Submission, is_graded: bool, module_config: Configuration) -> List[Feedback]:
logger.info("suggest_feedback: %s suggestions for submission %d of exercise %d were requested",
16 changes: 16 additions & 0 deletions modules/text/module_text_llm/module_text_llm/approach_config.py
@@ -0,0 +1,16 @@
from abc import ABC
from pydantic import BaseModel, Field
from llm_core.models import ModelConfigType, DefaultModelConfig
from enum import Enum

class ApproachType(str, Enum):
    basic = "BasicApproach"
    chain_of_thought = "ChainOfThought"

class ApproachConfig(BaseModel, ABC):
    max_input_tokens: int = Field(default=3000, description="Maximum number of tokens in the input prompt.")
    model: ModelConfigType = Field(default=DefaultModelConfig())
    type: str = Field(..., description="The type of approach config")

    class Config:
        use_enum_values = True
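As a sketch of the extension point this base class creates, a hypothetical third approach would follow the same pattern as the two configs added in this PR; the class name and literal tag below are made up for illustration.

from typing import Literal

from module_text_llm.approach_config import ApproachConfig

class SelfConsistencyApproachConfig(ApproachConfig):
    # Hypothetical example only; not part of this PR.
    type: Literal['self_consistency'] = 'self_consistency'
    # ...plus whatever prompt fields the approach needs, mirroring
    # BasicApproachConfig.generate_suggestions_prompt.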
@@ -0,0 +1,16 @@

from typing import List
from athena.text import Exercise, Submission, Feedback
from module_text_llm.basic_approach import BasicApproachConfig
from module_text_llm.chain_of_thought_approach import ChainOfThoughtConfig
from module_text_llm.approach_config import ApproachConfig

from module_text_llm.basic_approach.generate_suggestions import generate_suggestions as generate_suggestions_basic
from module_text_llm.chain_of_thought_approach.generate_suggestions import generate_suggestions as generate_cot_suggestions

async def generate_suggestions(exercise: Exercise, submission: Submission, config: ApproachConfig, debug: bool) -> List[Feedback]:
    if isinstance(config, BasicApproachConfig):
        return await generate_suggestions_basic(exercise, submission, config, debug)
    elif isinstance(config, ChainOfThoughtConfig):
        return await generate_cot_suggestions(exercise, submission, config, debug)
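A brief usage sketch of the controller (run inside an async function), assuming an Exercise and Submission are already at hand, as they are inside the feedback provider in __main__.py:

from module_text_llm.approach_controller import generate_suggestions
from module_text_llm.basic_approach import BasicApproachConfig
from module_text_llm.chain_of_thought_approach import ChainOfThoughtConfig

# The selected config instance decides which implementation runs.
feedbacks = await generate_suggestions(exercise, submission, BasicApproachConfig(), debug=False)
cot_feedbacks = await generate_suggestions(exercise, submission, ChainOfThoughtConfig(), debug=False)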

@@ -0,0 +1,11 @@
from module_text_llm.approach_config import ApproachConfig
from pydantic import Field
from typing import Literal


from module_text_llm.basic_approach.prompt_generate_suggestions import GenerateSuggestionsPrompt

class BasicApproachConfig(ApproachConfig):
    type: Literal['basic'] = 'basic'
    generate_suggestions_prompt: GenerateSuggestionsPrompt = Field(default=GenerateSuggestionsPrompt())

@@ -1,46 +1,22 @@
from typing import List, Optional, Sequence
from pydantic import BaseModel, Field
from typing import List

from athena import emit_meta
from athena.text import Exercise, Submission, Feedback
from athena.logger import logger

from module_text_llm.config import BasicApproachConfig
from llm_core.utils.llm_utils import (
get_chat_prompt_with_formatting_instructions,
check_prompt_length_and_omit_features_if_necessary,
num_tokens_from_prompt,
)
from athena.text import Exercise, Submission, Feedback
from llm_core.utils.predict_and_parse import predict_and_parse

from module_text_llm.config import BasicApproachConfig
from module_text_llm.helpers.utils import add_sentence_numbers, get_index_range_from_line_range, format_grading_instructions

class FeedbackModel(BaseModel):
title: str = Field(description="Very short title, i.e. feedback category or similar", example="Logic Error")
description: str = Field(description="Feedback description")
line_start: Optional[int] = Field(description="Referenced line number start, or empty if unreferenced")
line_end: Optional[int] = Field(description="Referenced line number end, or empty if unreferenced")
credits: float = Field(0.0, description="Number of points received/deducted")
grading_instruction_id: Optional[int] = Field(
description="ID of the grading instruction that was used to generate this feedback, or empty if no grading instruction was used"
)

class Config:
title = "Feedback"


class AssessmentModel(BaseModel):
"""Collection of feedbacks making up an assessment"""

feedbacks: Sequence[FeedbackModel] = Field(description="Assessment feedbacks")

class Config:
title = "Assessment"

from module_text_llm.basic_approach.prompt_generate_suggestions import AssessmentModel

async def generate_suggestions(exercise: Exercise, submission: Submission, config: BasicApproachConfig, debug: bool) -> List[Feedback]:
model = config.model.get_model() # type: ignore[attr-defined]

prompt_input = {
"max_points": exercise.max_points,
"bonus_points": exercise.bonus_points,
@@ -83,7 +59,8 @@ async def generate_suggestions(exercise: Exercise, submission: Submission, confi
tags=[
f"exercise-{exercise.id}",
f"submission-{submission.id}",
]
],
use_function_calling=True
)

if debug:
@@ -0,0 +1,65 @@
from typing import List, Optional
from pydantic import BaseModel, Field

system_message = """\
You are an AI tutor for text assessment at a prestigious university.

# Task
Create graded feedback suggestions for a student\'s text submission that a human tutor would accept. \
Meaning, the feedback you provide should be applicable to the submission with little to no modification.

# Style
1. Constructive, 2. Specific, 3. Balanced, 4. Clear and Concise, 5. Actionable, 6. Educational, 7. Contextual

# Problem statement
{problem_statement}

# Example solution
{example_solution}

# Grading instructions
{grading_instructions}
Max points: {max_points}, bonus points: {bonus_points}\

Respond in json.
"""

human_message = """\
Student\'s submission to grade (with sentence numbers <number>: <sentence>):

Respond in json.

\"\"\"
{submission}
\"\"\"\
"""

# Input Prompt
class GenerateSuggestionsPrompt(BaseModel):
    """\
    Features available: **{problem_statement}**, **{example_solution}**, **{grading_instructions}**, **{max_points}**, **{bonus_points}**, **{submission}**

    _Note: **{problem_statement}**, **{example_solution}**, or **{grading_instructions}** might be omitted if the input is too long._\
    """
    system_message: str = Field(default=system_message,
                                description="Message for priming AI behavior and instructing it what to do.")
    human_message: str = Field(default=human_message,
                               description="Message from a human. The input on which the AI is supposed to act.")
# Output Object
class FeedbackModel(BaseModel):
    title: str = Field(description="Very short title, i.e. feedback category or similar", example="Logic Error")
    description: str = Field(description="Feedback description")
    line_start: Optional[int] = Field(description="Referenced line number start, or empty if unreferenced")
    line_end: Optional[int] = Field(description="Referenced line number end, or empty if unreferenced")
    credits: float = Field(0.0, description="Number of points received/deducted")
    grading_instruction_id: Optional[int] = Field(
        description="ID of the grading instruction that was used to generate this feedback, or empty if no grading instruction was used"
    )


class AssessmentModel(BaseModel):
    """Collection of feedbacks making up an assessment"""

    feedbacks: List[FeedbackModel] = Field(description="Assessment feedbacks")
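Whichever path predict_and_parse takes, the parsed completion has to validate against AssessmentModel. A minimal example of a payload that would pass validation, with invented values for illustration:

assessment = AssessmentModel.parse_obj({
    "feedbacks": [
        {
            "title": "Missing justification",
            "description": "The claim in sentence 3 is stated without supporting reasoning.",
            "line_start": 3,
            "line_end": 3,
            "credits": -1.0,
            "grading_instruction_id": None,
        }
    ]
})
print(assessment.feedbacks[0].title)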

@@ -0,0 +1,15 @@
from pydantic import BaseModel, Field
from typing import Literal
from llm_core.models import ModelConfigType, MiniModelConfig

from module_text_llm.approach_config import ApproachConfig
from module_text_llm.chain_of_thought_approach.prompt_generate_feedback import CoTGenerateSuggestionsPrompt
from module_text_llm.chain_of_thought_approach.prompt_thinking import ThinkingPrompt

class ChainOfThoughtConfig(ApproachConfig):
    # Defaults to the cheaper mini 4o model
    type: Literal['chain_of_thought'] = 'chain_of_thought'
    model: ModelConfigType = Field(default=MiniModelConfig)  # type: ignore
    thinking_prompt: ThinkingPrompt = Field(default=ThinkingPrompt())
    generate_suggestions_prompt: CoTGenerateSuggestionsPrompt = Field(default=CoTGenerateSuggestionsPrompt())
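Finally, a short sketch of selecting the chain-of-thought approach from the configuration side. The two prompt fields correspond to the two chained calls (a free-form reasoning pass followed by the structured feedback pass); their implementation lives in chain_of_thought_approach/generate_suggestions.py and is not part of the excerpt above.

from module_text_llm.chain_of_thought_approach import ChainOfThoughtConfig

config = ChainOfThoughtConfig()
assert config.type == 'chain_of_thought'
# config.model defaults to the MiniModelConfig instance registered in llm_core.models,
# so both passes run on the cheaper gpt-4o-mini model by default.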
