From cea50d5f1057c025a5db15dcffc8f3e7d190a447 Mon Sep 17 00:00:00 2001
From: "Richard Kuo (Danswer)"
Date: Mon, 30 Sep 2024 09:57:41 -0700
Subject: [PATCH] Add size limit to jira tickets

---
 backend/danswer/configs/app_configs.py        |   4 +
 .../connectors/danswer_jira/connector.py      |  13 +-
 .../jira/test_large_ticket_handling.py        | 136 ++++++++++++++++++
 3 files changed, 151 insertions(+), 2 deletions(-)
 create mode 100644 backend/tests/unit/danswer/connectors/jira/test_large_ticket_handling.py

diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py
index d44e2de78e0..5a29783dc3e 100644
--- a/backend/danswer/configs/app_configs.py
+++ b/backend/danswer/configs/app_configs.py
@@ -247,6 +247,10 @@
     for ignored_tag in os.environ.get("JIRA_CONNECTOR_LABELS_TO_SKIP", "").split(",")
     if ignored_tag
 ]
+# Maximum size for Jira tickets in bytes (default: 100KB)
+JIRA_CONNECTOR_MAX_TICKET_SIZE = int(
+    os.environ.get("JIRA_CONNECTOR_MAX_TICKET_SIZE", 100 * 1024)
+)
 
 GONG_CONNECTOR_START_TIME = os.environ.get("GONG_CONNECTOR_START_TIME")
 
diff --git a/backend/danswer/connectors/danswer_jira/connector.py b/backend/danswer/connectors/danswer_jira/connector.py
index e3562f3a45c..05fa2e1e24d 100644
--- a/backend/danswer/connectors/danswer_jira/connector.py
+++ b/backend/danswer/connectors/danswer_jira/connector.py
@@ -9,6 +9,7 @@
 
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.app_configs import JIRA_CONNECTOR_LABELS_TO_SKIP
+from danswer.configs.app_configs import JIRA_CONNECTOR_MAX_TICKET_SIZE
 from danswer.configs.constants import DocumentSource
 from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
 from danswer.connectors.interfaces import GenerateDocumentsOutput
@@ -134,10 +135,18 @@ def fetch_jira_issues_batch(
             else extract_text_from_adf(jira.raw["fields"]["description"])
         )
         comments = _get_comment_strs(jira, comment_email_blacklist)
-        semantic_rep = f"{description}\n" + "\n".join(
+        ticket_content = f"{description}\n" + "\n".join(
             [f"Comment: {comment}" for comment in comments if comment]
         )
 
+        # Check ticket size
+        if len(ticket_content.encode("utf-8")) > JIRA_CONNECTOR_MAX_TICKET_SIZE:
+            logger.info(
+                f"Skipping {jira.key} because it exceeds the maximum size of "
+                f"{JIRA_CONNECTOR_MAX_TICKET_SIZE} bytes."
+            )
+            continue
+
         page_url = f"{jira_client.client_info()}/browse/{jira.key}"
 
         people = set()
@@ -180,7 +189,7 @@ def fetch_jira_issues_batch(
         doc_batch.append(
             Document(
                 id=page_url,
-                sections=[Section(link=page_url, text=semantic_rep)],
+                sections=[Section(link=page_url, text=ticket_content)],
                 source=DocumentSource.JIRA,
                 semantic_identifier=jira.fields.summary,
                 doc_updated_at=time_str_to_utc(jira.fields.updated),
diff --git a/backend/tests/unit/danswer/connectors/jira/test_large_ticket_handling.py b/backend/tests/unit/danswer/connectors/jira/test_large_ticket_handling.py
new file mode 100644
index 00000000000..d025e5d55df
--- /dev/null
+++ b/backend/tests/unit/danswer/connectors/jira/test_large_ticket_handling.py
@@ -0,0 +1,136 @@
+from collections.abc import Callable
+from collections.abc import Generator
+from typing import Any
+from unittest.mock import MagicMock
+from unittest.mock import patch
+
+import pytest
+from jira.resources import Issue
+from pytest_mock import MockFixture
+
+from danswer.connectors.danswer_jira.connector import fetch_jira_issues_batch
+
+
+@pytest.fixture
+def mock_jira_client() -> MagicMock:
+    return MagicMock()
+
+
+@pytest.fixture
+def mock_issue_small() -> MagicMock:
+    issue = MagicMock()
+    issue.key = "SMALL-1"
+    issue.fields.description = "Small description"
+    issue.fields.comment.comments = [
+        MagicMock(body="Small comment 1"),
+        MagicMock(body="Small comment 2"),
+    ]
+    issue.fields.creator.displayName = "John Doe"
+    issue.fields.creator.emailAddress = "john@example.com"
+    issue.fields.summary = "Small Issue"
+    issue.fields.updated = "2023-01-01T00:00:00+0000"
+    issue.fields.labels = []
+    return issue
+
+
+@pytest.fixture
+def mock_issue_large() -> MagicMock:
+    # This will be larger than 100KB
+    issue = MagicMock()
+    issue.key = "LARGE-1"
+    issue.fields.description = "a" * 99_000
+    issue.fields.comment.comments = [
+        MagicMock(body="Large comment " * 1000),
+        MagicMock(body="Another large comment " * 1000),
+    ]
+    issue.fields.creator.displayName = "Jane Doe"
+    issue.fields.creator.emailAddress = "jane@example.com"
+    issue.fields.summary = "Large Issue"
+    issue.fields.updated = "2023-01-02T00:00:00+0000"
+    issue.fields.labels = []
+    return issue
+
+
+@pytest.fixture
+def patched_type() -> Callable[[Any], type]:
+    def _patched_type(obj: Any) -> type:
+        if isinstance(obj, MagicMock):
+            return Issue
+        return type(obj)
+
+    return _patched_type
+
+
+@pytest.fixture
+def mock_jira_api_version() -> Generator[Any, Any, Any]:
+    with patch("danswer.connectors.danswer_jira.connector.JIRA_API_VERSION", "2"):
+        yield
+
+
+@pytest.fixture
+def patched_environment(
+    patched_type: type,
+    mock_jira_api_version: MockFixture,
+) -> Generator[Any, Any, Any]:
+    with patch("danswer.connectors.danswer_jira.connector.type", patched_type):
+        yield
+
+
+def test_fetch_jira_issues_batch_small_ticket(
+    mock_jira_client: MagicMock,
+    mock_issue_small: MagicMock,
+    patched_environment: MockFixture,
+) -> None:
+    mock_jira_client.search_issues.return_value = [mock_issue_small]
+
+    docs, count = fetch_jira_issues_batch("project = TEST", 0, mock_jira_client)
+
+    assert count == 1
+    assert len(docs) == 1
+    assert docs[0].id.endswith("/SMALL-1")
+    assert "Small description" in docs[0].sections[0].text
+    assert "Small comment 1" in docs[0].sections[0].text
+    assert "Small comment 2" in docs[0].sections[0].text
+
+
+def test_fetch_jira_issues_batch_large_ticket(
+    mock_jira_client: MagicMock,
+    mock_issue_large: MagicMock,
+    patched_environment: MockFixture,
+) -> None:
+    mock_jira_client.search_issues.return_value = [mock_issue_large]
+
+    docs, count = fetch_jira_issues_batch("project = TEST", 0, mock_jira_client)
+
+    assert count == 1
+    assert len(docs) == 0  # The large ticket should be skipped
+
+
+def test_fetch_jira_issues_batch_mixed_tickets(
+    mock_jira_client: MagicMock,
+    mock_issue_small: MagicMock,
+    mock_issue_large: MagicMock,
+    patched_environment: MockFixture,
+) -> None:
+    mock_jira_client.search_issues.return_value = [mock_issue_small, mock_issue_large]
+
+    docs, count = fetch_jira_issues_batch("project = TEST", 0, mock_jira_client)
+
+    assert count == 2
+    assert len(docs) == 1  # Only the small ticket should be included
+    assert docs[0].id.endswith("/SMALL-1")
+
+
+@patch("danswer.connectors.danswer_jira.connector.JIRA_CONNECTOR_MAX_TICKET_SIZE", 50)
+def test_fetch_jira_issues_batch_custom_size_limit(
+    mock_jira_client: MagicMock,
+    mock_issue_small: MagicMock,
+    mock_issue_large: MagicMock,
+    patched_environment: MockFixture,
+) -> None:
+    mock_jira_client.search_issues.return_value = [mock_issue_small, mock_issue_large]
+
+    docs, count = fetch_jira_issues_batch("project = TEST", 0, mock_jira_client)
+
+    assert count == 2
+    assert len(docs) == 0  # Both tickets should be skipped due to the low size limit