Skip to content

fix: Continue summarizer on error and add more logging #2220

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docker-compose.services.yml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ services:
environment:
- KEYCLOAK_ADMIN=${KEYCLOAK_SVC_ADMIN:-admin}
- KEYCLOAK_ADMIN_PASSWORD=${KEYCLOAK_SVC_ADMIN_PASSWORD:-admin}
- "_JAVA_OPTIONS=${JAVA_OPTIONS:-}" # Load _JAVA_OPTIONS from env, fallback to empty string
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My Keycloak container was not starting up because of java errors so I had to do the same thing here that I did with opensearch container. #2069 & #2082

networks:
default:
aliases:
Expand Down
55 changes: 42 additions & 13 deletions learning_resources/content_summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
from langchain_community.chat_models import ChatLiteLLM
from typing_extensions import TypedDict

from learning_resources.exceptions import (
FlashcardsGenerationError,
SummaryGenerationError,
)
from learning_resources.models import (
ContentFile,
ContentSummarizerConfiguration,
Expand Down Expand Up @@ -124,31 +128,36 @@ def get_unprocessed_content_file_ids(

def summarize_content_files_by_ids(
self, content_file_ids: list[int], overwrite
) -> None:
) -> list[str]:
"""Process multiple content files by id.

Args:
- ids (list[int]): List of content file ids to process
- overwrite (bool): Whether to overwrite existing summary and flashcards

Returns:
- None
- list[str]: List of status messages for each content file
"""
status_messages = []
for content_file_id in content_file_ids:
self.summarize_single_content_file(content_file_id, overwrite=overwrite)
status_msg = self.summarize_single_content_file(
content_file_id, overwrite=overwrite
)
status_messages.append(status_msg)
return status_messages

def summarize_single_content_file(
self,
content_file_id: int,
overwrite,
) -> None:
) -> str:
"""Process a single content file
Args:
- content_file_id (int): Id of the content file to process
- overwrite (bool): Whether to overwrite existing summary and flashcards

Returns:
- None
- str: A string message indicating the status of the summarization
"""
try:
with transaction.atomic():
Expand All @@ -175,10 +184,24 @@ def summarize_single_content_file(

if updated:
content_file.save()

except Exception:
return f"Summarization succeeded for CONTENT_FILE_ID: {content_file_id}" # noqa: E501
return f"Summarization skipped for CONTENT_FILE_ID: {content_file_id}"
except SummaryGenerationError as exc:
# Log and return a specific readable error message when summary
# generation fails.
logger.exception("Error processing content: %d", content_file.id)
return f"Summary generation failed for CONTENT_FILE_ID: {content_file_id}\nError: {exc.args[0]}\n\n" # noqa: E501
except FlashcardsGenerationError as exc:
# Log and return a specific readable error message when flashcards
# generation fails.
return f"Flashcards generation failed for CONTENT_FILE_ID: {content_file_id}\nError: {exc.args[0]}\n\n" # noqa: E501
except Exception as exc:
# Log and return a specific readable error message when an unknown
# error occurs.
logger.exception("Error processing content: %d", content_file.id)
raise
return (
f"Summarization failed for CONTENT_FILE_ID: {content_file_id}\nError: {exc.args[0]}\n\n", # noqa: E501
)

def _get_llm(self, model=None, temperature=0.0, max_tokens=1000) -> ChatLiteLLM:
"""Get the ChatLiteLLM instance"""
Expand Down Expand Up @@ -216,11 +239,15 @@ def _generate_summary(self, content: str, llm_model: str) -> str:
generated_summary = response.content
logger.info("Generated summary: %s", generated_summary)

except Exception:
except Exception as exc:
# We do not want to raise the exception as is, we will log the exception and
# raise SummaryGenerationError that will be used to make further decisions
# in the code.
logger.exception(
"An error occurred while generating summary using model: %s", llm_model
)
raise
raise SummaryGenerationError(exc) from exc

else:
return generated_summary

Expand All @@ -243,12 +270,14 @@ def _generate_flashcards(
)
generated_flashcards = response.get("flashcards")
logger.info("Generated flashcards: %s", generated_flashcards)

except Exception:
except Exception as exc:
# We do not want to raise the exception as is, we will log the exception and
# raise FlashcardsGenerationError that will be used to make further
# decisions in the code.
logger.exception(
"An error occurred while generating flashcards using model: %s",
llm_model,
)
raise
raise FlashcardsGenerationError(exc) from exc
else:
return generated_flashcards
84 changes: 82 additions & 2 deletions learning_resources/content_summarizer_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
PlatformType,
)
from learning_resources.content_summarizer import ContentSummarizer
from learning_resources.exceptions import (
FlashcardsGenerationError,
SummaryGenerationError,
)
from learning_resources.factories import (
ContentFileFactory,
ContentSummarizerConfigurationFactory,
Expand Down Expand Up @@ -242,9 +246,9 @@ def test_get_unprocessed_content_files_with_platform_and_config(
def test_summarize_content_files_by_ids(
processable_content_files, mock_summarize_single_content_file
):
"""The summarizer should process content files that are processable"""
"""The summarizer should process content files that are processable and return the status results"""
summarizer = ContentSummarizer()
summarizer.summarize_content_files_by_ids(
results = summarizer.summarize_content_files_by_ids(
overwrite=False,
content_file_ids=[
content_file.id for content_file in processable_content_files
Expand All @@ -253,6 +257,8 @@ def test_summarize_content_files_by_ids(
assert mock_summarize_single_content_file.call_count == len(
processable_content_files
)
assert isinstance(results, list)
assert len(results) == len(processable_content_files)


def test_summarize_single_content_file(mocker, processable_content_files):
Expand Down Expand Up @@ -332,3 +338,77 @@ def test_process_single_file_calls_llm_summary(
assert mock_instance.with_structured_output.call_count == 1
elif has_flashcards:
assert mock_instance.invoke.call_count == 1


@pytest.mark.parametrize(
("process_type", "expected_exception"),
[("summary", SummaryGenerationError), ("flashcards", FlashcardsGenerationError)],
)
def test_generate_summary_flashcards_exception(
mocker, processable_content_files, settings, process_type, expected_exception
):
"""Test the exception handling in the generate_summary and generate_flashcards methods"""
settings.OPENAI_API_KEY = "test"
summarizer = ContentSummarizer()
content_file = processable_content_files[0]
content_file.save()

# Mock the ChatLiteLLM class and its methods
mock_chat_llm = mocker.patch(
"learning_resources.content_summarizer.ChatLiteLLM", autospec=True
)
mock_instance = mock_chat_llm.return_value

# Mock the response for _generate_summary to raise an exception
mock_instance.invoke.side_effect = Exception("Test exception")
# Mock the response for _generate_flashcards to raise an exception
mock_instance.with_structured_output.return_value.invoke.side_effect = Exception(
"INVALID_FORMAT"
)

if process_type == "summary":
with pytest.raises(expected_exception):
summarizer._generate_summary( # noqa: SLF001
llm_model="llm_model", content=content_file.content
)
else:
with pytest.raises(expected_exception):
summarizer._generate_flashcards( # noqa: SLF001
llm_model="llm_model", content=content_file.content
)


def test_summarize_single_content_file_with_exception(
mocker, processable_content_files, settings
):
"""Test the exception handling in the summarize_single_content_file method"""
settings.OPENAI_API_KEY = "test"
summarizer = ContentSummarizer()
content_file = processable_content_files[0]

# Mock the ChatLiteLLM class and its methods
mock_chat_llm = mocker.patch(
"learning_resources.content_summarizer.ChatLiteLLM", autospec=True
)
mock_instance = mock_chat_llm.return_value

# Mock the response for _generate_summary to raise an exception
mock_instance.invoke.side_effect = Exception("Test exception")
# Mock the response for _generate_flashcards to raise an exception
mock_instance.with_structured_output.return_value.invoke.side_effect = Exception(
"INVALID_FORMAT"
)

error = summarizer.summarize_single_content_file(content_file.id, overwrite=False)
assert (
error
== f"Summary generation failed for CONTENT_FILE_ID: {content_file.id}\nError: Test exception\n\n"
)
content_file.summary = "Test summary"
content_file.save()
content_file.refresh_from_db()
error = summarizer.summarize_single_content_file(content_file.id, overwrite=False)
assert (
error
== f"Flashcards generation failed for CONTENT_FILE_ID: {content_file.id}\nError: INVALID_FORMAT\n\n"
)
8 changes: 8 additions & 0 deletions learning_resources/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,11 @@ class PostHogAuthenticationError(Exception):

class PostHogQueryError(Exception):
"""Raised if the PostHog query API returns a non-authentication error."""


class SummaryGenerationError(Exception):
"""Raised if the summary generation fails for a content file."""


class FlashcardsGenerationError(Exception):
"""Raised if the flashcards generation fails for a content file."""
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Management command to run the content summarizer"""

import itertools

from django.conf import settings
from django.core.management import BaseCommand

Expand Down Expand Up @@ -91,9 +93,16 @@ def handle(self, *args, **options): # noqa: ARG002
self.stdout.write("Waiting on task...")

start = now_in_utc()
summarizer_task.get()
results = summarizer_task.get()

# Log the summarization stats
flat_results = list(itertools.chain(*results))
for result in flat_results:
self.stdout.write(f"{result}")

total_seconds = (now_in_utc() - start).total_seconds()
self.stdout.write(
f"Content file summarizer finished, took {total_seconds} seconds"
self.style.SUCCESS(
f"Content file summarizer finished, took {total_seconds} seconds"
)
)
2 changes: 1 addition & 1 deletion learning_resources/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ def summarize_content_files_task(
- None
"""
summarizer = ContentSummarizer()
summarizer.summarize_content_files_by_ids(content_file_ids, overwrite)
return summarizer.summarize_content_files_by_ids(content_file_ids, overwrite)


@app.task(bind=True, acks_late=True)
Expand Down
Loading