From ba7876f868efc83324c38985160111f4906ed03b Mon Sep 17 00:00:00 2001 From: Arslan Ashraf Date: Wed, 30 Apr 2025 15:40:46 +0500 Subject: [PATCH 1/2] fix: Add logs and continue sumarizer on error --- docker-compose.services.yml | 1 + learning_resources/content_summarizer.py | 42 ++++++++++++++----- learning_resources/exceptions.py | 8 ++++ .../commands/generate_summary_flashcards.py | 13 +++++- learning_resources/tasks.py | 2 +- 5 files changed, 52 insertions(+), 14 deletions(-) diff --git a/docker-compose.services.yml b/docker-compose.services.yml index 3ec0cb1928..3cceb7fbb7 100644 --- a/docker-compose.services.yml +++ b/docker-compose.services.yml @@ -101,6 +101,7 @@ services: environment: - KEYCLOAK_ADMIN=${KEYCLOAK_SVC_ADMIN:-admin} - KEYCLOAK_ADMIN_PASSWORD=${KEYCLOAK_SVC_ADMIN_PASSWORD:-admin} + - "_JAVA_OPTIONS=${JAVA_OPTIONS:-}" # Load _JAVA_OPTIONS from env, fallback to empty string networks: default: aliases: diff --git a/learning_resources/content_summarizer.py b/learning_resources/content_summarizer.py index 2c4a02ce67..88f9d2891c 100644 --- a/learning_resources/content_summarizer.py +++ b/learning_resources/content_summarizer.py @@ -7,6 +7,10 @@ from langchain_community.chat_models import ChatLiteLLM from typing_extensions import TypedDict +from learning_resources.exceptions import ( + FlashcardsGenerationError, + SummaryGenerationError, +) from learning_resources.models import ( ContentFile, ContentSummarizerConfiguration, @@ -134,21 +138,26 @@ def summarize_content_files_by_ids( Returns: - None """ + status_messages = [] for content_file_id in content_file_ids: - self.summarize_single_content_file(content_file_id, overwrite=overwrite) + status_msg = self.summarize_single_content_file( + content_file_id, overwrite=overwrite + ) + status_messages.append(status_msg) + return status_messages def summarize_single_content_file( self, content_file_id: int, overwrite, - ) -> None: + ) -> tuple[bool, str]: """Process a single content file Args: - content_file_id (int): Id of the content file to process - overwrite (bool): Whether to overwrite existing summary and flashcards Returns: - - None + - str: A string message indicating the status of the summarization """ try: with transaction.atomic(): @@ -175,10 +184,20 @@ def summarize_single_content_file( if updated: content_file.save() + return f"Content file summarization succeeded for CONTENT_FILE_ID: {content_file_id}" # noqa: E501 + return f"Content file summarization skipped for CONTENT_FILE_ID: {content_file_id}" # noqa: E501 + + except SummaryGenerationError as exc: + return f"Content file summary generation failed for CONTENT_FILE_ID: {content_file_id}\nError: {exc.args[0]}\n\n" # noqa: E501 - except Exception: + except FlashcardsGenerationError as exc: + return f"Content file flashcards generation failed for CONTENT_FILE_ID: {content_file_id}\nError: {exc.args[0]}\n\n" # noqa: E501 + except Exception as exc: logger.exception("Error processing content: %d", content_file.id) - raise + return ( + False, + f"Content file summarization failed for CONTENT_FILE_ID: {content_file_id}\nError: {exc.args[0]}\n\n", # noqa: E501 + ) def _get_llm(self, model=None, temperature=0.0, max_tokens=1000) -> ChatLiteLLM: """Get the ChatLiteLLM instance""" @@ -216,13 +235,14 @@ def _generate_summary(self, content: str, llm_model: str) -> str: generated_summary = response.content logger.info("Generated summary: %s", generated_summary) - except Exception: + except Exception as exc: logger.exception( "An error occurred while generating summary using model: %s", llm_model ) - raise + raise SummaryGenerationError(exc) from exc + else: - return generated_summary + return True, generated_summary def _generate_flashcards( self, content: str, llm_model: str @@ -243,12 +263,12 @@ def _generate_flashcards( ) generated_flashcards = response.get("flashcards") logger.info("Generated flashcards: %s", generated_flashcards) - - except Exception: + except Exception as exc: logger.exception( "An error occurred while generating flashcards using model: %s", llm_model, ) - raise + raise FlashcardsGenerationError(exc) from exc + else: return generated_flashcards diff --git a/learning_resources/exceptions.py b/learning_resources/exceptions.py index dd1b6296dc..eb67b19d2b 100644 --- a/learning_resources/exceptions.py +++ b/learning_resources/exceptions.py @@ -13,3 +13,11 @@ class PostHogAuthenticationError(Exception): class PostHogQueryError(Exception): """Raised if the PostHog query API returns a non-authentication error.""" + + +class SummaryGenerationError(Exception): + """Raised if the summary generation fails for a content file.""" + + +class FlashcardsGenerationError(Exception): + """Raised if the flashcards generation fails for a content file.""" diff --git a/learning_resources/management/commands/generate_summary_flashcards.py b/learning_resources/management/commands/generate_summary_flashcards.py index 9cde1dc462..e79fbb68b2 100644 --- a/learning_resources/management/commands/generate_summary_flashcards.py +++ b/learning_resources/management/commands/generate_summary_flashcards.py @@ -1,5 +1,7 @@ """Management command to run the content summarizer""" +import itertools + from django.conf import settings from django.core.management import BaseCommand @@ -91,9 +93,16 @@ def handle(self, *args, **options): # noqa: ARG002 self.stdout.write("Waiting on task...") start = now_in_utc() - summarizer_task.get() + results = summarizer_task.get() + + # Log the summarization stats + flat_results = list(itertools.chain(*results)) + for result in flat_results: + self.stdout.write(f"{result}") total_seconds = (now_in_utc() - start).total_seconds() self.stdout.write( - f"Content file summarizer finished, took {total_seconds} seconds" + self.style.SUCCESS( + f"Content file summarizer finished, took {total_seconds} seconds" + ) ) diff --git a/learning_resources/tasks.py b/learning_resources/tasks.py index d2dd0c7d3d..767691e1d6 100644 --- a/learning_resources/tasks.py +++ b/learning_resources/tasks.py @@ -424,7 +424,7 @@ def summarize_content_files_task( - None """ summarizer = ContentSummarizer() - summarizer.summarize_content_files_by_ids(content_file_ids, overwrite) + return summarizer.summarize_content_files_by_ids(content_file_ids, overwrite) @app.task(bind=True, acks_late=True) From f8e63f52b6b579e1fb822bae5f71c03d03171d69 Mon Sep 17 00:00:00 2001 From: Arslan Ashraf Date: Fri, 2 May 2025 15:01:40 +0500 Subject: [PATCH 2/2] add tests --- learning_resources/content_summarizer.py | 35 +++++--- learning_resources/content_summarizer_test.py | 84 ++++++++++++++++++- 2 files changed, 104 insertions(+), 15 deletions(-) diff --git a/learning_resources/content_summarizer.py b/learning_resources/content_summarizer.py index 88f9d2891c..04952ab6ae 100644 --- a/learning_resources/content_summarizer.py +++ b/learning_resources/content_summarizer.py @@ -128,7 +128,7 @@ def get_unprocessed_content_file_ids( def summarize_content_files_by_ids( self, content_file_ids: list[int], overwrite - ) -> None: + ) -> list[str]: """Process multiple content files by id. Args: @@ -136,7 +136,7 @@ def summarize_content_files_by_ids( - overwrite (bool): Whether to overwrite existing summary and flashcards Returns: - - None + - list[str]: List of status messages for each content file """ status_messages = [] for content_file_id in content_file_ids: @@ -150,7 +150,7 @@ def summarize_single_content_file( self, content_file_id: int, overwrite, - ) -> tuple[bool, str]: + ) -> str: """Process a single content file Args: - content_file_id (int): Id of the content file to process @@ -184,19 +184,23 @@ def summarize_single_content_file( if updated: content_file.save() - return f"Content file summarization succeeded for CONTENT_FILE_ID: {content_file_id}" # noqa: E501 - return f"Content file summarization skipped for CONTENT_FILE_ID: {content_file_id}" # noqa: E501 - + return f"Summarization succeeded for CONTENT_FILE_ID: {content_file_id}" # noqa: E501 + return f"Summarization skipped for CONTENT_FILE_ID: {content_file_id}" except SummaryGenerationError as exc: - return f"Content file summary generation failed for CONTENT_FILE_ID: {content_file_id}\nError: {exc.args[0]}\n\n" # noqa: E501 - + # Log and return a specific readable error message when summary + # generation fails. + logger.exception("Error processing content: %d", content_file.id) + return f"Summary generation failed for CONTENT_FILE_ID: {content_file_id}\nError: {exc.args[0]}\n\n" # noqa: E501 except FlashcardsGenerationError as exc: - return f"Content file flashcards generation failed for CONTENT_FILE_ID: {content_file_id}\nError: {exc.args[0]}\n\n" # noqa: E501 + # Log and return a specific readable error message when flashcards + # generation fails. + return f"Flashcards generation failed for CONTENT_FILE_ID: {content_file_id}\nError: {exc.args[0]}\n\n" # noqa: E501 except Exception as exc: + # Log and return a specific readable error message when an unknown + # error occurs. logger.exception("Error processing content: %d", content_file.id) return ( - False, - f"Content file summarization failed for CONTENT_FILE_ID: {content_file_id}\nError: {exc.args[0]}\n\n", # noqa: E501 + f"Summarization failed for CONTENT_FILE_ID: {content_file_id}\nError: {exc.args[0]}\n\n", # noqa: E501 ) def _get_llm(self, model=None, temperature=0.0, max_tokens=1000) -> ChatLiteLLM: @@ -236,13 +240,16 @@ def _generate_summary(self, content: str, llm_model: str) -> str: logger.info("Generated summary: %s", generated_summary) except Exception as exc: + # We do not want to raise the exception as is, we will log the exception and + # raise SummaryGenerationError that will be used to make further decisions + # in the code. logger.exception( "An error occurred while generating summary using model: %s", llm_model ) raise SummaryGenerationError(exc) from exc else: - return True, generated_summary + return generated_summary def _generate_flashcards( self, content: str, llm_model: str @@ -264,11 +271,13 @@ def _generate_flashcards( generated_flashcards = response.get("flashcards") logger.info("Generated flashcards: %s", generated_flashcards) except Exception as exc: + # We do not want to raise the exception as is, we will log the exception and + # raise FlashcardsGenerationError that will be used to make further + # decisions in the code. logger.exception( "An error occurred while generating flashcards using model: %s", llm_model, ) raise FlashcardsGenerationError(exc) from exc - else: return generated_flashcards diff --git a/learning_resources/content_summarizer_test.py b/learning_resources/content_summarizer_test.py index a80e1a531c..6c81bb4fc9 100644 --- a/learning_resources/content_summarizer_test.py +++ b/learning_resources/content_summarizer_test.py @@ -6,6 +6,10 @@ PlatformType, ) from learning_resources.content_summarizer import ContentSummarizer +from learning_resources.exceptions import ( + FlashcardsGenerationError, + SummaryGenerationError, +) from learning_resources.factories import ( ContentFileFactory, ContentSummarizerConfigurationFactory, @@ -242,9 +246,9 @@ def test_get_unprocessed_content_files_with_platform_and_config( def test_summarize_content_files_by_ids( processable_content_files, mock_summarize_single_content_file ): - """The summarizer should process content files that are processable""" + """The summarizer should process content files that are processable and return the status results""" summarizer = ContentSummarizer() - summarizer.summarize_content_files_by_ids( + results = summarizer.summarize_content_files_by_ids( overwrite=False, content_file_ids=[ content_file.id for content_file in processable_content_files @@ -253,6 +257,8 @@ def test_summarize_content_files_by_ids( assert mock_summarize_single_content_file.call_count == len( processable_content_files ) + assert isinstance(results, list) + assert len(results) == len(processable_content_files) def test_summarize_single_content_file(mocker, processable_content_files): @@ -332,3 +338,77 @@ def test_process_single_file_calls_llm_summary( assert mock_instance.with_structured_output.call_count == 1 elif has_flashcards: assert mock_instance.invoke.call_count == 1 + + +@pytest.mark.parametrize( + ("process_type", "expected_exception"), + [("summary", SummaryGenerationError), ("flashcards", FlashcardsGenerationError)], +) +def test_generate_summary_flashcards_exception( + mocker, processable_content_files, settings, process_type, expected_exception +): + """Test the exception handling in the generate_summary and generate_flashcards methods""" + settings.OPENAI_API_KEY = "test" + summarizer = ContentSummarizer() + content_file = processable_content_files[0] + content_file.save() + + # Mock the ChatLiteLLM class and its methods + mock_chat_llm = mocker.patch( + "learning_resources.content_summarizer.ChatLiteLLM", autospec=True + ) + mock_instance = mock_chat_llm.return_value + + # Mock the response for _generate_summary to raise an exception + mock_instance.invoke.side_effect = Exception("Test exception") + # Mock the response for _generate_flashcards to raise an exception + mock_instance.with_structured_output.return_value.invoke.side_effect = Exception( + "INVALID_FORMAT" + ) + + if process_type == "summary": + with pytest.raises(expected_exception): + summarizer._generate_summary( # noqa: SLF001 + llm_model="llm_model", content=content_file.content + ) + else: + with pytest.raises(expected_exception): + summarizer._generate_flashcards( # noqa: SLF001 + llm_model="llm_model", content=content_file.content + ) + + +def test_summarize_single_content_file_with_exception( + mocker, processable_content_files, settings +): + """Test the exception handling in the summarize_single_content_file method""" + settings.OPENAI_API_KEY = "test" + summarizer = ContentSummarizer() + content_file = processable_content_files[0] + + # Mock the ChatLiteLLM class and its methods + mock_chat_llm = mocker.patch( + "learning_resources.content_summarizer.ChatLiteLLM", autospec=True + ) + mock_instance = mock_chat_llm.return_value + + # Mock the response for _generate_summary to raise an exception + mock_instance.invoke.side_effect = Exception("Test exception") + # Mock the response for _generate_flashcards to raise an exception + mock_instance.with_structured_output.return_value.invoke.side_effect = Exception( + "INVALID_FORMAT" + ) + + error = summarizer.summarize_single_content_file(content_file.id, overwrite=False) + assert ( + error + == f"Summary generation failed for CONTENT_FILE_ID: {content_file.id}\nError: Test exception\n\n" + ) + content_file.summary = "Test summary" + content_file.save() + content_file.refresh_from_db() + error = summarizer.summarize_single_content_file(content_file.id, overwrite=False) + assert ( + error + == f"Flashcards generation failed for CONTENT_FILE_ID: {content_file.id}\nError: INVALID_FORMAT\n\n" + )