Optimize Confluence API Usage with V1 Search Endpoint (#10906)

* Reduce number of Confluence API queries
* ✨
dust-tt · Feb 18, 2025 · 2470e52 · 2470e52
1 parent cb34d1d
commit 2470e52
Show file tree

Hide file tree

Showing 4 changed files with 88 additions and 36 deletions.
diff --git a/connectors/src/connectors/confluence/lib/confluence_api.ts b/connectors/src/connectors/confluence/lib/confluence_api.ts
@@ -116,9 +116,11 @@ export async function pageHasReadRestrictions(
 }
 
 export interface ConfluencePageRef {
+  hasChildren: boolean;
+  hasReadRestrictions: boolean;
   id: string;
-  version: number;
   parentId: string | null;
+  version: number;
 }
 
 const PAGE_FETCH_LIMIT = 100;
@@ -172,19 +174,27 @@ export async function bulkFetchConfluencePageRefs(
     spaceId: string;
   }
 ) {
-  // Fetch the details of the pages (version and parentId).
+  // Fetch page metadata (version, parent, permissions, etc.) for the given page IDs
   const pagesWithDetails = await client.getPagesByIdsInSpace({
     spaceId,
-    sort: "id",
     pageIds,
     limit,
   });
 
-  const pageRefs: ConfluencePageRef[] = pagesWithDetails.pages.map((p) => ({
-    id: p.id,
-    version: p.version.number,
-    parentId: p.parentId,
-  }));
+  const pageRefs: ConfluencePageRef[] = pagesWithDetails.results.map((p) => {
+    const hasReadRestrictions =
+      p.restrictions.read.restrictions.group.results.length > 0 ||
+      p.restrictions.read.restrictions.user.results.length > 0;
+
+    return {
+      hasChildren: p.childTypes.page.value,
+      hasReadRestrictions,
+      id: p.id,
+      // Ancestors is an array of the page's ancestors, starting with the root page.
+      parentId: p.ancestors[p.ancestors.length - 1]?.id ?? null,
+      version: p.version.number,
+    };
+  });
 
   return pageRefs;
 }
diff --git a/connectors/src/connectors/confluence/lib/confluence_client.ts b/connectors/src/connectors/confluence/lib/confluence_client.ts
@@ -22,6 +22,7 @@ const ConfluenceAccessibleResourcesCodec = t.array(
 const ConfluenceSpaceCodec = t.intersection([
   t.type({
     id: t.string,
+    key: t.string,
     name: t.string,
     _links: t.type({
       webui: t.string,
@@ -57,6 +58,49 @@ const ConfluencePageCodec = t.intersection([
   CatchAllCodec,
 ]);
 
+const SearchConfluencePageCodec = t.intersection([
+  t.type({
+    id: t.string,
+    type: t.string,
+    status: t.string,
+    title: t.string,
+
+    // Version info.
+    version: t.type({
+      number: t.number,
+    }),
+
+    // Restrictions.
+    restrictions: t.type({
+      read: t.type({
+        restrictions: t.type({
+          user: t.type({
+            results: t.array(t.unknown),
+          }),
+          group: t.type({
+            results: t.array(t.unknown),
+          }),
+        }),
+      }),
+    }),
+
+    // Children info
+    childTypes: t.type({
+      page: t.type({
+        value: t.boolean,
+      }),
+    }),
+
+    // Ancestors (parent chain)
+    ancestors: t.array(
+      t.type({
+        id: t.string,
+      })
+    ),
+  }),
+  CatchAllCodec,
+]);
+
 const ConfluencePageWithBodyCodec = t.intersection([
   ConfluencePageCodec,
   t.type({
@@ -547,42 +591,37 @@ export class ConfluenceClient {
 
   async getPagesByIdsInSpace({
     spaceId,
-    sort,
-    pageCursor,
     pageIds,
     limit,
   }: {
     spaceId: string;
-    sort?: "id" | "-modified-date";
-    pageCursor?: string | null;
-    pageIds?: string[];
+    pageIds: string[];
     limit?: number;
   }) {
+    // First get space info to get the key.
+    // TODO(2025-02-18 flav) Save the key in the DB.
+    const space = await this.getSpaceById(spaceId);
+
+    // Build CQL query to get pages with specific IDs.
+    const idClause = pageIds?.length ? ` AND id in (${pageIds.join(",")})` : "";
+    const cqlQuery = `type=page AND space="${space.key}"${idClause}`;
+
     const params = new URLSearchParams({
-      sort: sort ?? "id",
+      cql: cqlQuery,
       limit: limit?.toString() ?? "25",
-      status: "current",
-      "space-id": spaceId,
+      expand: [
+        "version", // to check if page changed.
+        "restrictions.read.restrictions.user", // to check user permissions.
+        "restrictions.read.restrictions.group", // to check group permissions.
+        "childTypes.page", // to know if it has children.
+        "ancestors", // to get parent info.
+      ].join(","),
     });
 
-    if (pageCursor) {
-      params.append("cursor", pageCursor);
-    }
-
-    if (pageIds && pageIds.length > 0) {
-      params.append("id", pageIds.join(","));
-    }
-
-    const pages = await this.request(
-      `${this.restApiBaseUrl}/pages?${params.toString()}`,
-      ConfluencePaginatedResults(ConfluencePageCodec)
+    return this.request(
+      `${this.legacyRestApiBaseUrl}/content/search?${params.toString()}`,
+      ConfluencePaginatedResults(SearchConfluencePageCodec)
     );
-    const nextPageCursor = extractCursorFromLinks(pages._links);
-
-    return {
-      pages: pages.results,
-      nextPageCursor,
-    };
   }
 
   async getPageById(pageId: string) {

diff --git a/connectors/src/connectors/confluence/temporal/activities.ts b/connectors/src/connectors/confluence/temporal/activities.ts
@@ -431,7 +431,7 @@ export async function confluenceCheckAndUpsertPageActivity({
   );
 
   // Check restrictions.
-  const hasReadRestrictions = await pageHasReadRestrictions(client, pageId);
+  const { hasReadRestrictions } = pageRef;
   if (hasReadRestrictions) {
     localLogger.info("Skipping restricted Confluence page.");
     return false;
@@ -713,8 +713,6 @@ export async function fetchAndUpsertRootPagesActivity(params: {
     }
   }
 
-  console.log(">> allowedRootPageIds", allowedRootPageIds);
-
   return allowedRootPageIds;
 }
 

diff --git a/connectors/src/connectors/confluence/temporal/workflows.ts b/connectors/src/connectors/confluence/temporal/workflows.ts
@@ -280,6 +280,11 @@ export async function confluenceSyncTopLevelChildPagesWorkflow(
       }
     }
 
+    // Only attempt to fetch children if the page has known children.
+    if (isPageRef && !current.hasChildren) {
+      continue;
+    }
+
     // Get child pages using either initial empty cursor or saved cursor.
     const { childPageRefs, nextPageCursor } =
       await confluenceGetActiveChildPageRefsActivity({