Bugfix/confluence time zone (#3265)
* RedisLock typing

* checkpoint

* put in debug logging

* improve comments

* mypy fixes

rkuo-danswer authored Dec 3, 2024
1 parent 9e9b7ed commit 5f28a1b
Showing 5 changed files with 67 additions and 9 deletions.
11 changes: 6 additions & 5 deletions backend/danswer/background/celery/apps/app_base.py
```diff
@@ -11,6 +11,7 @@
 from celery.states import READY_STATES
 from celery.utils.log import get_task_logger
 from celery.worker import strategy  # type: ignore
+from redis.lock import Lock as RedisLock
 from sentry_sdk.integrations.celery import CeleryIntegration
 from sqlalchemy import text
 from sqlalchemy.orm import Session
```
```diff
@@ -332,16 +333,16 @@ def on_worker_shutdown(sender: Any, **kwargs: Any) -> None:
         return
 
     logger.info("Releasing primary worker lock.")
-    lock = sender.primary_worker_lock
+    lock: RedisLock = sender.primary_worker_lock
     try:
         if lock.owned():
             try:
                 lock.release()
                 sender.primary_worker_lock = None
-            except Exception as e:
-                logger.error(f"Failed to release primary worker lock: {e}")
-    except Exception as e:
-        logger.error(f"Failed to check if primary worker lock is owned: {e}")
+            except Exception:
+                logger.exception("Failed to release primary worker lock")
+    except Exception:
+        logger.exception("Failed to check if primary worker lock is owned")
 
 
 def on_setup_logging(
```
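The error-handling change above is worth noting: `logger.exception` records the full traceback automatically, while `logger.error(f"... {e}")` keeps only the exception's message. A minimal standalone sketch of the difference (the failing function here is hypothetical):

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("demo")


def release_lock() -> None:
    # Hypothetical failing operation standing in for lock.release()
    raise RuntimeError("lock no longer owned")


try:
    release_lock()
except Exception as e:
    # Message only -- the traceback is lost:
    logger.error(f"Failed to release lock: {e}")
    # Message plus full traceback, no manual interpolation needed:
    logger.exception("Failed to release lock")
```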
5 changes: 3 additions & 2 deletions backend/danswer/background/celery/apps/primary.py
```diff
@@ -11,6 +11,7 @@
 from celery.signals import worker_init
 from celery.signals import worker_ready
 from celery.signals import worker_shutdown
+from redis.lock import Lock as RedisLock
 
 import danswer.background.celery.apps.app_base as app_base
 from danswer.background.celery.apps.app_base import task_logger
```
```diff
@@ -116,7 +117,7 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
     # it is planned to use this lock to enforce singleton behavior on the primary
     # worker, since the primary worker does redis cleanup on startup, but this isn't
     # implemented yet.
-    lock = r.lock(
+    lock: RedisLock = r.lock(
         DanswerRedisLocks.PRIMARY_WORKER,
         timeout=CELERY_PRIMARY_WORKER_LOCK_TIMEOUT,
     )
```
```diff
@@ -227,7 +228,7 @@ def run_periodic_task(self, worker: Any) -> None:
         if not hasattr(worker, "primary_worker_lock"):
             return
 
-        lock = worker.primary_worker_lock
+        lock: RedisLock = worker.primary_worker_lock
 
         r = get_redis_client(tenant_id=None)
```
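The comment in the hunk above describes the intended use: a named Redis lock that would enforce singleton behavior on the primary worker. A rough sketch of that pattern with redis-py, assuming a local Redis instance (the lock name and timeout are placeholders, not the project's `DanswerRedisLocks.PRIMARY_WORKER` and `CELERY_PRIMARY_WORKER_LOCK_TIMEOUT` values):

```python
import redis
from redis.lock import Lock as RedisLock

r = redis.Redis()  # assumes a local Redis instance

# Illustrative name and timeout; the lock auto-expires after `timeout`
# seconds if the holder dies without releasing it.
lock: RedisLock = r.lock("example:primary_worker", timeout=120)

# blocking=False returns immediately instead of waiting for the holder.
if not lock.acquire(blocking=False):
    raise SystemExit("Another primary worker already holds the lock; exiting.")

try:
    pass  # startup cleanup and periodic work would run here
finally:
    if lock.owned():
        lock.release()
```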
16 changes: 16 additions & 0 deletions backend/danswer/configs/app_configs.py
```diff
@@ -308,6 +308,22 @@
     os.environ.get("CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD", 200_000)
 )
 
+# Due to breakages in the Confluence API, the timezone offset must be specified
+# client side to match the user's specified timezone.
+
+# The current state of affairs:
+# - CQL queries are parsed in the user's timezone and cannot be specified in UTC.
+# - No API retrieves the user's timezone.
+# - All data is returned in UTC, so we can't derive the user's timezone from that.
+
+# https://community.developer.atlassian.com/t/confluence-cloud-time-zone-get-via-rest-api/35954/16
+# https://jira.atlassian.com/browse/CONFCLOUD-69670
+
+# Enter as a floating-point offset from UTC in hours (-24 < val < 24).
+# This will be applied globally, so it probably makes sense to transition it to
+# per-connector at some point.
+CONFLUENCE_TIMEZONE_OFFSET = float(os.environ.get("CONFLUENCE_TIMEZONE_OFFSET", 1.0))
+
 JIRA_CONNECTOR_LABELS_TO_SKIP = [
     ignored_tag
     for ignored_tag in os.environ.get("JIRA_CONNECTOR_LABELS_TO_SKIP", "").split(",")
```
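The same epoch instant renders to a different wall-clock string under each tzinfo, which is exactly what the offset compensates for: Confluence parses the CQL string in the user's timezone, so the client must produce it there. A small self-contained demonstration (the 1.0-hour offset mirrors the default above):

```python
from datetime import datetime, timedelta, timezone

offset_hours = 1.0  # mirrors CONFLUENCE_TIMEZONE_OFFSET's default
user_tz = timezone(offset=timedelta(hours=offset_hours))

epoch = 1701619200.0  # 2023-12-03 16:00:00 UTC

print(datetime.fromtimestamp(epoch, tz=timezone.utc).strftime("%Y-%m-%d %H:%M"))
# 2023-12-03 16:00
print(datetime.fromtimestamp(epoch, tz=user_tz).strftime("%Y-%m-%d %H:%M"))
# 2023-12-03 17:00 -- the string a UTC+1 Confluence user expects in CQL
```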
11 changes: 9 additions & 2 deletions backend/danswer/connectors/confluence/connector.py
```diff
@@ -1,9 +1,11 @@
 from datetime import datetime
+from datetime import timedelta
 from datetime import timezone
 from typing import Any
 from urllib.parse import quote
 
 from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
+from danswer.configs.app_configs import CONFLUENCE_TIMEZONE_OFFSET
 from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
```
```diff
@@ -69,6 +71,7 @@ def __init__(
         # skip it. This is generally used to avoid indexing extra sensitive
         # pages.
         labels_to_skip: list[str] = CONFLUENCE_CONNECTOR_LABELS_TO_SKIP,
+        timezone_offset: float = CONFLUENCE_TIMEZONE_OFFSET,
     ) -> None:
         self.batch_size = batch_size
         self.continue_on_failure = continue_on_failure
```
```diff
@@ -104,6 +107,8 @@ def __init__(
         )
         self.cql_label_filter = f" and label not in ({comma_separated_labels})"
 
+        self.timezone: timezone = timezone(offset=timedelta(hours=timezone_offset))
+
     @property
     def confluence_client(self) -> OnyxConfluence:
         if self._confluence_client is None:
```
```diff
@@ -204,12 +209,14 @@ def _fetch_document_batches(self) -> GenerateDocumentsOutput:
         confluence_page_ids: list[str] = []
 
         page_query = self.cql_page_query + self.cql_label_filter + self.cql_time_filter
+        logger.debug(f"page_query: {page_query}")
         # Fetch pages as Documents
         for page in self.confluence_client.paginated_cql_retrieval(
             cql=page_query,
             expand=",".join(_PAGE_EXPANSION_FIELDS),
             limit=self.batch_size,
         ):
+            logger.debug(f"_fetch_document_batches: {page['id']}")
             confluence_page_ids.append(page["id"])
             doc = self._convert_object_to_document(page)
             if doc is not None:
```
```diff
@@ -242,10 +249,10 @@ def load_from_state(self) -> GenerateDocumentsOutput:
 
     def poll_source(self, start: float, end: float) -> GenerateDocumentsOutput:
         # Add time filters
-        formatted_start_time = datetime.fromtimestamp(start, tz=timezone.utc).strftime(
+        formatted_start_time = datetime.fromtimestamp(start, tz=self.timezone).strftime(
             "%Y-%m-%d %H:%M"
         )
-        formatted_end_time = datetime.fromtimestamp(end, tz=timezone.utc).strftime(
+        formatted_end_time = datetime.fromtimestamp(end, tz=self.timezone).strftime(
             "%Y-%m-%d %H:%M"
         )
         self.cql_time_filter = f" and lastmodified >= '{formatted_start_time}'"
```
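With `self.timezone` in place, `poll_source` renders the poll window in the user's timezone before embedding it in the CQL `lastmodified` filter. A rough standalone sketch of the resulting filter string (the helper name is illustrative, and the `<=` end bound, truncated from the diff above, is an assumption):

```python
from datetime import datetime, timedelta, timezone


def build_cql_time_filter(start: float, end: float, tz: timezone) -> str:
    # Illustrative helper mirroring what poll_source does with self.timezone.
    fmt = "%Y-%m-%d %H:%M"
    start_s = datetime.fromtimestamp(start, tz=tz).strftime(fmt)
    end_s = datetime.fromtimestamp(end, tz=tz).strftime(fmt)
    return f" and lastmodified >= '{start_s}' and lastmodified <= '{end_s}'"


user_tz = timezone(timedelta(hours=1.0))
print(build_cql_time_filter(1701619200.0, 1701622800.0, user_tz))
# " and lastmodified >= '2023-12-03 17:00' and lastmodified <= '2023-12-03 18:00'"
```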
33 changes: 33 additions & 0 deletions backend/danswer/connectors/confluence/onyx_confluence.py
```diff
@@ -134,6 +134,32 @@ def __init__(self, url: str, *args: Any, **kwargs: Any) -> None:
         super(OnyxConfluence, self).__init__(url, *args, **kwargs)
         self._wrap_methods()
 
+    def get_current_user(self, expand: str | None = None) -> Any:
+        """
+        Implements a method that isn't in the third-party client.
+
+        Get information about the current user.
+        :param expand: OPTIONAL: expand to get the user's status.
+            The possible value is "status"; results are "Active" or "Deactivated".
+        :return: Returns the user details
+        """
+
+        from atlassian.errors import ApiPermissionError  # type:ignore
+
+        url = "rest/api/user/current"
+        params = {}
+        if expand:
+            params["expand"] = expand
+        try:
+            response = self.get(url, params=params)
+        except HTTPError as e:
+            if e.response.status_code == 403:
+                raise ApiPermissionError(
+                    "The calling user does not have permission", reason=e
+                )
+            raise
+        return response
+
     def _wrap_methods(self) -> None:
         """
         For each attribute that is callable (i.e., a method) and doesn't start with an underscore,
```
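A possible way to exercise the new method while debugging; the constructor arguments are illustrative, since the real connector builds its client from stored credentials:

```python
# Hypothetical debugging snippet -- not part of the commit.
client = OnyxConfluence(
    url="https://example.atlassian.net/wiki",
    username="me@example.com",
    password="my-api-token",
)

# Returns the raw user details from rest/api/user/current;
# expand="status" additionally reports Active/Deactivated.
user = client.get_current_user(expand="status")
print(user)
```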
```diff
@@ -306,6 +332,13 @@ def _validate_connector_configuration(
     )
     spaces = confluence_client_with_minimal_retries.get_all_spaces(limit=1)
 
+    # Uncomment the following for testing.
+    # This is an attempt to retrieve the user's timezone.
+    # Unfortunately, all data is returned in UTC regardless of the user's time zone,
+    # even though CQL parses incoming times based on the user's time zone.
+    # space_key = spaces["results"][0]["key"]
+    # space_details = confluence_client_with_minimal_retries.cql(f"space.key={space_key}+AND+type=space")
+
     if not spaces:
         raise RuntimeError(
             f"No spaces found at {wiki_base}! "
```
