From 0a42e8df46d5d0ef27da00817663700724136157 Mon Sep 17 00:00:00 2001
From: philipperolet
Date: Wed, 30 Aug 2023 13:33:36 +0200
Subject: [PATCH 01/32] doc update of parents field

---
 core/src/data_sources/data_source.rs | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/core/src/data_sources/data_source.rs b/core/src/data_sources/data_source.rs
index 8cf4574c1edf..7bd5aeeff6a3 100644
--- a/core/src/data_sources/data_source.rs
+++ b/core/src/data_sources/data_source.rs
@@ -92,13 +92,19 @@ pub struct Chunk {
 /// The "parents" field is an array of ids of parents to the document,
 /// corresponding to its hierarchy, ordered by closest parent first.
 ///
-/// At index 0 is the document's direct parent, then at index 1 is the direct
-/// parent of the element represented at index 0, etc. It is assumed that a
-/// document (or folder, or hierarchical level) only has at most one direct
-/// parent. Therefore, there is an unambiguous mapping between the parents array
-/// and the document's hierarchical position. For example, for a regular file
-/// system (or filesystem-like such as Google Drive), each parent would
-/// correspond to a subfolder in the path to the document.
+/// At index 0 is the document id itself, then at index 1 its direct parent,
+/// then at index 2 the direct parent of the element represented at index 1,
+/// etc. It is assumed that a document (or folder, or hierarchical level) has
+/// at most one direct parent. Therefore, there is an unambiguous mapping
+/// between the parents array and the document's hierarchical position. For
+/// example, for a regular file system (or a filesystem-like hierarchy such as
+/// Google Drive), each parent would correspond to a subfolder in the path to
+/// the document.
+///
+/// The document's own id is stored in the field because the field is used as
+/// a search filter to restrict a search to parts of the hierarchy: it is
+/// natural that when the document's id is selected as a parent filter, the
+/// document itself shows up in the search.
 ///
 /// Note however that the hierarchical system depends on the managed datasource.
 /// For example, in the Slack managed datasource, documents only have a single
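(A quick illustration of the convention this patch documents, with hypothetical
ids: a Google Drive file stored under `folder_a/folder_b/` would carry
`parents = ["<file_id>", "<folder_b_id>", "<folder_a_id>"]`, i.e. the
document's own id at index 0, followed by its parents, closest first.)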
From 605fbe3056392d408cf71155bba73b6861ac8e13 Mon Sep 17 00:00:00 2001
From: philipperolet
Date: Wed, 30 Aug 2023 16:40:06 +0200
Subject: [PATCH 02/32] core api to update parents (+ refactor of update tags)

---
 core/bin/dust_api.rs                 | 138 +++++++++++++++++++--------
 core/src/data_sources/data_source.rs |  46 ++++++++-
 core/src/stores/postgres.rs          |  64 +++++++++----
 core/src/stores/store.rs             |   7 ++
 4 files changed, 194 insertions(+), 61 deletions(-)

diff --git a/core/bin/dust_api.rs b/core/bin/dust_api.rs
index 008acf180f10..04c3c036a887 100644
--- a/core/bin/dust_api.rs
+++ b/core/bin/dust_api.rs
@@ -13,8 +13,9 @@ use axum::{
 use dust::{
     app,
     blocks::block::BlockType,
-    data_sources::data_source::{self, SearchFilter},
-    dataset, project,
+    data_sources::data_source::{self, DataSource, SearchFilter},
+    dataset,
+    project::{self, Project},
     providers::provider::{provider, ProviderID},
     run,
     stores::postgres,
@@ -1199,6 +1200,7 @@ async fn data_sources_documents_update_tags(
     };
     let add_tags_set: HashSet<String> = add_tags.iter().cloned().collect();
     let remove_tags_set: HashSet<String> = remove_tags.iter().cloned().collect();
+
     if add_tags_set.intersection(&remove_tags_set).count() > 0 {
         return (
             StatusCode::BAD_REQUEST,
@@ -1213,12 +1215,96 @@ async fn data_sources_documents_update_tags(
             }),
         );
     }
+
+    match load_data_source(&state, &project, &data_source_id).await {
+        Ok(ds) => match ds
+            .update_tags(
+                state.store.clone(),
+                state.qdrant_client.clone(),
+                document_id,
+                add_tags_set.into_iter().collect(),
+                remove_tags_set.into_iter().collect(),
+            )
+            .await
+        {
+            Err(e) => server_error_response("Failed to update document tags", e),
+            Ok(_) => datasource_ok_response(ds),
+        },
+        Err(api_error) => api_error,
+    }
+}
+
+/// Update parents of a document in a data source.
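+///
+/// As a minimal illustration, the expected JSON body has the following shape
+/// (ids are hypothetical, ordered closest parent first, with the document's
+/// own id at index 0 as documented in `data_source.rs`):
+///
+///   { "parents": ["<document_id>", "<direct_parent_id>", "<root_folder_id>"] }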
+
+#[derive(serde::Deserialize)]
+struct DataSourcesDocumentsUpdateParentsPayload {
+    parents: Vec<String>,
+}
+
+async fn data_sources_documents_update_parents(
+    extract::Path((project_id, data_source_id, document_id)): extract::Path<(i64, String, String)>,
+    extract::Json(payload): extract::Json<DataSourcesDocumentsUpdateParentsPayload>,
+    extract::Extension(state): extract::Extension<Arc<APIState>>,
+) -> (StatusCode, Json<APIResponse>) {
+    let project = project::Project::new_from_id(project_id);
+
+    match load_data_source(&state, &project, &data_source_id).await {
+        Ok(ds) => match ds
+            .update_parents(
+                state.store.clone(),
+                state.qdrant_client.clone(),
+                document_id,
+                payload.parents,
+            )
+            .await
+        {
+            Err(e) => server_error_response("Failed to update document parents", e),
+            Ok(_) => datasource_ok_response(ds),
+        },
+        Err(api_error) => api_error,
+    }
+}
+
+fn server_error_response(message: &str, e: anyhow::Error) -> (StatusCode, Json<APIResponse>) {
+    (
+        StatusCode::INTERNAL_SERVER_ERROR,
+        Json(APIResponse {
+            error: Some(APIError {
+                code: String::from("internal_server_error"),
+                message: format!("{}: {}", message, e),
+            }),
+            response: None,
+        }),
+    )
+}
+
+fn datasource_ok_response(ds: DataSource) -> (StatusCode, Json<APIResponse>) {
+    (
+        StatusCode::OK,
+        Json(APIResponse {
+            error: None,
+            response: Some(json!({
+                "data_source": {
+                    "created": ds.created(),
+                    "data_source_id": ds.data_source_id(),
+                    "config": ds.config(),
+                },
+            })),
+        }),
+    )
+}
+
+async fn load_data_source(
+    state: &Arc<APIState>,
+    project: &Project,
+    data_source_id: &str,
+) -> Result<DataSource, (StatusCode, Json<APIResponse>)> {
     match state
         .store
         .load_data_source(&project, &data_source_id)
         .await
     {
-        Err(e) => (
+        Err(e) => Err((
             StatusCode::INTERNAL_SERVER_ERROR,
             Json(APIResponse {
                 error: Some(APIError {
                     code: String::from("internal_server_error"),
                     message: format!("Failed to retrieve data source: {}", e),
                 }),
                 response: None,
             }),
-        ),
+        )),
         Ok(ds) => match ds {
-            None => (
+            None => Err((
                 StatusCode::NOT_FOUND,
                 Json(APIResponse {
                     error: Some(APIError {
                         code: String::from("data_source_not_found"),
                         message: format!("No data source found for id `{}`", data_source_id),
                     }),
                     response: None,
                 }),
-            ),
-            Some(ds) => match ds
-                .update_tags(
-                    state.store.clone(),
-                    state.qdrant_client.clone(),
-                    document_id,
-                    add_tags_set.into_iter().collect(),
-                    remove_tags_set.into_iter().collect(),
-                )
-                .await
-            {
-                Err(e) => (
-                    StatusCode::INTERNAL_SERVER_ERROR,
-                    Json(APIResponse {
-                        error: Some(APIError {
-                            code: String::from("internal_server_error"),
-                            message: format!("Failed to update document tags: {}", e),
-                        }),
-                        response: None,
-                    }),
-                ),
-                Ok(_) => (
-                    StatusCode::OK,
-                    Json(APIResponse {
-                        error: None,
-                        response: Some(json!({
-                            "data_source": {
-                                "created": ds.created(),
-                                "data_source_id": ds.data_source_id(),
-                                "config": ds.config(),
-                            },
-                        })),
-                    }),
-                ),
-            },
+            )),
+            Some(ds) => Ok(ds),
         },
     }
 }
-
 // List versions of a document in a data source.
 
 #[derive(serde::Deserialize)]
@@ -1840,6 +1892,10 @@ fn main() {
             "/projects/:project_id/data_sources/:data_source_id/documents/:document_id/tags",
             patch(data_sources_documents_update_tags),
         )
+        .route(
+            "/projects/:project_id/data_sources/:data_source_id/documents/:document_id/parents",
+            patch(data_sources_documents_update_parents),
+        )
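+        // Illustrative usage only (hypothetical host and ids):
+        //   curl -X PATCH <core-host>/projects/1/data_sources/my-ds/documents/my-doc/parents \
+        //     -H "Content-Type: application/json" \
+        //     -d '{"parents": ["my-doc", "<parent_id>"]}'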
         // Provided by the data_source block.
         .route(
             "/projects/:project_id/data_sources/:data_source_id/search",

diff --git a/core/src/data_sources/data_source.rs b/core/src/data_sources/data_source.rs
index 7bd5aeeff6a3..cba0c0101080 100644
--- a/core/src/data_sources/data_source.rs
+++ b/core/src/data_sources/data_source.rs
@@ -395,6 +395,32 @@ impl DataSource {
         Ok(())
     }
 
+    pub async fn update_parents(
+        &self,
+        store: Box<dyn Store + Sync + Send>,
+        qdrant_client: Arc<QdrantClient>,
+        document_id: String,
+        parents: Vec<String>,
+    ) -> Result<()> {
+        let new_parents = store
+            .update_data_source_document_parents(
+                &self.project,
+                &self.data_source_id(),
+                &document_id.to_string(),
+                &parents,
+            )
+            .await?;
+
+        self.upsert_field_to_document_payload(
+            qdrant_client,
+            document_id,
+            "parents",
+            new_parents.clone(),
+        )
+        .await?;
+        Ok(())
+    }
+
     pub async fn update_tags(
         &self,
         store: Box<dyn Store + Sync + Send>,
@@ -412,8 +438,21 @@ impl DataSource {
             &remove_tags,
         )
         .await?;
+        self.upsert_field_to_document_payload(qdrant_client, document_id, "tags", new_tags.clone())
+            .await?;
+        Ok(new_tags)
+    }
+
+    async fn upsert_field_to_document_payload(
+        &self,
+        qdrant_client: Arc<QdrantClient>,
+        document_id: String,
+        field_name: &str,
+        field_value: impl Into<qdrant::Value>,
+    ) -> Result<()> {
         let mut payload = Payload::new();
-        payload.insert("tags", new_tags.clone());
+        payload.insert(field_name, field_value.into());
+
         let field_condition = qdrant::FieldCondition {
             key: "document_id".to_string(),
             r#match: Some(qdrant::Match {
                 match_value: Some(qdrant::r#match::MatchValue::Text(document_id)),
             }),
             ..Default::default()
         };
+
         let points_selector = PointsSelector {
             points_selector_one_of: Some(PointsSelectorOneOf::Filter(Filter {
                 must: vec![field_condition.into()],
                 ..Default::default()
             })),
         };
+
         qdrant_client
             .set_payload(
                 self.qdrant_collection().to_string(),
                 &points_selector,
                 payload,
                 None,
             )
             .await?;
-
-        Ok(new_tags)
+        Ok(())
     }
 
     pub async fn upsert(

diff --git a/core/src/stores/postgres.rs b/core/src/stores/postgres.rs
index 298962248adf..7933108a0ccf 100644
--- a/core/src/stores/postgres.rs
+++ b/core/src/stores/postgres.rs
@@ -13,7 +13,7 @@ use crate::stores::store::{Store, POSTGRES_TABLES, SQL_INDEXES};
 use crate::utils;
 use anyhow::{anyhow, Result};
 use async_trait::async_trait;
-use bb8::Pool;
+use bb8::{Pool, PooledConnection};
 use bb8_postgres::PostgresConnectionManager;
 use serde_json::Value;
 use std::collections::{HashMap, HashSet};
@@ -1134,6 +1134,30 @@ impl Store for PostgresStore {
         }
     }
 
+    async fn update_data_source_document_parents(
+        &self,
+        project: &Project,
+        data_source_id: &str,
+        document_id: &str,
+        parents: &Vec<String>,
+    ) -> Result<()> {
+        let document_id = document_id.to_string();
+        let pool = self.pool.clone();
+        let mut c = pool.get().await?;
+        let data_source_row_id =
+            get_data_source_row_id(project.project_id(), data_source_id.to_string(), &mut c)
+                .await?;
+
+        c.execute(
+            "UPDATE data_sources_documents SET parents = $1 \
+            WHERE data_source = $2 AND document_id = $3 AND status = 'latest'",
+            &[&parents, &data_source_row_id, &document_id],
+        )
+        .await?;
+
+        Ok(())
+    }
+
     async fn update_data_source_document_tags(
         &self,
         project: &Project,
         data_source_id: &str,
         document_id: &str,
         add_tags: &Vec<String>,
         remove_tags: &Vec<String>,
     ) -> Result<Vec<String>> {
-        let project_id = project.project_id();
-        let data_source_id = data_source_id.to_string();
         let document_id = document_id.to_string();
-
         let pool = self.pool.clone();
         let mut c = pool.get().await?;
-
-        let r = c
-            .query(
-                "SELECT id FROM data_sources WHERE project = $1 AND data_source_id = $2 LIMIT 1",
-                &[&project_id, &data_source_id],
-            )
-            .await?;
-
-        let data_source_row_id: i64
= match r.len() {
-            0 => Err(anyhow!("Unknown DataSource: {}", data_source_id))?,
-            1 => r[0].get(0),
-            _ => unreachable!(),
-        };
+        let data_source_row_id =
+            get_data_source_row_id(project.project_id(), data_source_id.to_string(), &mut c)
+                .await?;
 
         let tx = c.transaction().await?;
 
@@ -1974,3 +1985,22 @@ impl Store for PostgresStore {
         Box::new(self.clone())
     }
 }
+
+async fn get_data_source_row_id(
+    project_id: i64,
+    data_source_id: String,
+    connection: &mut PooledConnection<'_, PostgresConnectionManager<NoTls>>,
+) -> Result<i64> {
+    let r = connection
+        .query(
+            "SELECT id FROM data_sources WHERE project = $1 AND data_source_id = $2 LIMIT 1",
+            &[&project_id, &data_source_id],
+        )
+        .await?;
+
+    match r.len() {
+        0 => Err(anyhow!("Unknown DataSource: {}", data_source_id))?,
+        1 => Ok(r[0].get(0)),
+        _ => unreachable!(),
+    }
+}

diff --git a/core/src/stores/store.rs b/core/src/stores/store.rs
index 4282e165d60e..cf2bffbef4e7 100644
--- a/core/src/stores/store.rs
+++ b/core/src/stores/store.rs
@@ -114,6 +114,13 @@ pub trait Store {
         add_tags: &Vec<String>,
         remove_tags: &Vec<String>,
     ) -> Result<Vec<String>>;
+    async fn update_data_source_document_parents(
+        &self,
+        project: &Project,
+        data_source_id: &str,
+        document_id: &str,
+        parents: &Vec<String>,
+    ) -> Result<()>;
     async fn list_data_source_document_versions(
         &self,
         project: &Project,

From 8004a790b93fb4a70f285d3c3186cc5150fb7c8b Mon Sep 17 00:00:00 2001
From: philipperolet
Date: Thu, 31 Aug 2023 17:49:44 +0200
Subject: [PATCH 03/32] spolu review

---
 core/bin/dust_api.rs                 | 852 +++++++++++----------------
 core/src/data_sources/data_source.rs |  13 +-
 core/src/stores/postgres.rs          |  58 +-
 3 files changed, 379 insertions(+), 544 deletions(-)

diff --git a/core/bin/dust_api.rs b/core/bin/dust_api.rs
index 04c3c036a887..98145eab399a 100644
--- a/core/bin/dust_api.rs
+++ b/core/bin/dust_api.rs
@@ -13,9 +13,9 @@ use axum::{
 use dust::{
     app,
     blocks::block::BlockType,
-    data_sources::data_source::{self, DataSource, SearchFilter},
+    data_sources::data_source::{self, SearchFilter},
     dataset,
-    project::{self, Project},
+    project::{self},
     providers::provider::{provider, ProviderID},
     run,
     stores::postgres,
@@ -142,15 +142,11 @@ async fn projects_create(
     extract::Extension(state): extract::Extension<Arc<APIState>>,
 ) -> (StatusCode, Json<APIResponse>) {
     match state.store.create_project().await {
-        Err(e) => (
+        Err(e) => error_response(
             StatusCode::INTERNAL_SERVER_ERROR,
-            Json(APIResponse {
-                error: Some(APIError {
-                    code: String::from("internal_server_error"),
-                    message: format!("Failed to create a new project: {}", e),
-                }),
-                response: None,
-            }),
+            "internal_server_error",
+            "Failed to create a new project",
+            Some(e),
         ),
         Ok(project) => (
            StatusCode::OK,
@@ -176,15 +172,11 @@ async fn projects_clone(
     // Create cloned project
     let project = match state.store.create_project().await {
         Err(e) => {
-            return (
+            return error_response(
                 StatusCode::INTERNAL_SERVER_ERROR,
-                Json(APIResponse {
-                    error: Some(APIError {
-                        code: String::from("internal_server_error"),
-                        message: format!("Failed to create cloned project: {}", e),
-                    }),
-                    response: None,
-                }),
+                "internal_server_error",
+                "Failed to create cloned project",
+                Some(e),
             )
         }
         Ok(project) => project,
     };
 
     // Retrieve datasets
     let datasets = match state.store.list_datasets(&cloned).await {
         Err(e) => {
-            return (
+            return error_response(
                 StatusCode::INTERNAL_SERVER_ERROR,
-                Json(APIResponse {
-                    error: Some(APIError {
-                        code: String::from("internal_server_error"),
-                        message: format!("Failed to list cloned project datasets: {}", e),
-
}), - response: None, - }), + "internal_server_error", + "Failed to list cloned project datasets", + Some(e), ) } Ok(datasets) => datasets, @@ -228,15 +216,11 @@ async fn projects_clone( .collect::>>() { Err(e) => { - return ( + return error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to clone project datasets: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to clone project datasets", + Some(e), ) } Ok(_) => (), @@ -266,15 +250,11 @@ async fn specifications_check( ) -> (StatusCode, Json) { let _project = project::Project::new_from_id(project_id); match app::App::new(&payload.specification).await { - Err(e) => ( + Err(e) => error_response( StatusCode::BAD_REQUEST, - Json(APIResponse { - error: Some(APIError { - code: String::from("invalid_specification_error"), - message: e.to_string(), - }), - response: None, - }), + "invalid_specification_error", + "Invalid specification", + Some(e), ), Ok(app) => ( StatusCode::OK, @@ -299,26 +279,18 @@ async fn specifications_retrieve( ) -> (StatusCode, Json) { let project = project::Project::new_from_id(project_id); match state.store.load_specification(&project, &hash).await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to retrieve specification: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to retrieve specification", + Some(e), ), Ok(s) => match s { - None => ( + None => error_response( StatusCode::NOT_FOUND, - Json(APIResponse { - error: Some(APIError { - code: String::from("specification_not_found"), - message: format!("No specification found with hash `{}`", hash), - }), - response: None, - }), + "specification_not_found", + &format!("No specification found with hash `{}`", hash), + None, ), Some((created, spec)) => ( StatusCode::OK, @@ -351,15 +323,11 @@ async fn datasets_register( ) -> (StatusCode, Json) { let project = project::Project::new_from_id(project_id); match dataset::Dataset::new_from_jsonl(&payload.dataset_id, payload.data).await { - Err(e) => ( + Err(e) => error_response( StatusCode::BAD_REQUEST, - Json(APIResponse { - error: Some(APIError { - code: String::from("invalid_dataset_error"), - message: e.to_string(), - }), - response: None, - }), + "invalid_dataset_error", + "Invalid dataset", + Some(e), ), Ok(d) => { // First retrieve the latest hash of the dataset to avoid registering if it matches the @@ -374,15 +342,11 @@ async fn datasets_register( }; if !(current_hash.is_some() && current_hash.unwrap() == d.hash()) { match state.store.register_dataset(&project, &d).await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to store dataset: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to store dataset", + Some(e), ), Ok(()) => ( StatusCode::OK, @@ -425,15 +389,11 @@ async fn datasets_list( ) -> (StatusCode, Json) { let project = project::Project::new_from_id(project_id); match state.store.list_datasets(&project).await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to list datasets: 
{}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to list datasets", + Some(e), ), Ok(datasets) => { let datasets = datasets @@ -471,29 +431,21 @@ async fn datasets_retrieve( ) -> (StatusCode, Json) { let project = project::Project::new_from_id(project_id); match state.store.load_dataset(&project, &dataset_id, &hash).await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to retrieve dataset: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to retrieve dataset", + Some(e), ), Ok(dataset) => match dataset { - None => ( + None => error_response( StatusCode::NOT_FOUND, - Json(APIResponse { - error: Some(APIError { - code: String::from("dataset_not_found"), - message: format!( - "No dataset found for id `{}` and hash `{}`", - dataset_id, hash - ), - }), - response: None, - }), + "dataset_not_found", + &format!( + "No dataset found for id `{}` and hash `{}`", + dataset_id, hash, + ), + None, ), Some(d) => ( StatusCode::OK, @@ -523,7 +475,7 @@ async fn run_helper( project_id: i64, payload: RunsCreatePayload, state: Arc, -) -> Result { +) -> Result)> { let project = project::Project::new_from_id(project_id); let mut register_spec = true; @@ -531,20 +483,18 @@ async fn run_helper( Some(spec) => spec, None => match payload.specification_hash { Some(hash) => match state.store.load_specification(&project, &hash).await { - Err(e) => Err(( + Err(e) => Err(error_response( StatusCode::INTERNAL_SERVER_ERROR, - APIError { - code: String::from("internal_server_error"), - message: format!("Failed to retrieve specification: {}", e), - }, + "internal_server_error", + "Failed to retrieve specification", + Some(e), ))?, Ok(spec) => match spec { - None => Err(( + None => Err(error_response( StatusCode::NOT_FOUND, - APIError { - code: String::from("specification_not_found"), - message: format!("No specification found for hash `{}`", hash), - }, + "specification_not_found", + &format!("No specification found for hash `{}`", hash), + None, ))?, Some((_, s)) => { register_spec = false; @@ -552,26 +502,22 @@ async fn run_helper( } }, }, - None => Err(( + None => Err(error_response( StatusCode::BAD_REQUEST, - APIError { - code: String::from("missing_specification_error"), - message: String::from( - "No specification provided, \ - either `specification` or `specification_hash` must be provided", - ), - }, + "missing_specification_error", + "No specification provided, either `specification` or + `specification_hash` must be provided", + None, ))?, }, }; let mut app = match app::App::new(&specification).await { - Err(e) => Err(( + Err(e) => Err(error_response( StatusCode::BAD_REQUEST, - APIError { - code: String::from("invalid_specification_error"), - message: e.to_string(), - }, + "invalid_specification_error", + "Invalid specification", + Some(e), ))?, Ok(app) => app, }; @@ -579,31 +525,28 @@ async fn run_helper( let mut d = match payload.dataset_id.as_ref() { None => None, Some(dataset_id) => match state.store.latest_dataset_hash(&project, dataset_id).await { - Err(e) => Err(( + Err(e) => Err(error_response( StatusCode::INTERNAL_SERVER_ERROR, - APIError { - code: String::from("internal_server_error"), - message: format!("Failed to retrieve dataset: {}", e), - }, + "internal_server_error", + "Failed to retrieve dataset", + Some(e), ))?, - Ok(None) => Err(( + Ok(None) => Err(error_response( StatusCode::NOT_FOUND, - 
APIError { - code: String::from("dataset_not_found"), - message: format!("No dataset found for id `{}`", dataset_id), - }, + "dataset_not_found", + &format!("No dataset found for id `{}`", dataset_id), + None, ))?, Ok(Some(latest)) => match state .store .load_dataset(&project, dataset_id, &latest) .await { - Err(e) => Err(( + Err(e) => Err(error_response( StatusCode::INTERNAL_SERVER_ERROR, - APIError { - code: String::from("internal_server_error"), - message: format!("Failed to retrieve dataset: {}", e), - }, + "internal_server_error", + "Failed to retrieve dataset", + Some(e), ))?, Ok(d) => match d { None => unreachable!(), @@ -615,27 +558,23 @@ async fn run_helper( if d.is_some() { if payload.run_type != run::RunType::Local { - Err(( + Err(error_response( StatusCode::BAD_REQUEST, - APIError { - code: String::from("invalid_run_type_error"), - message: String::from( - "RunType `local` is expected when a `dataset_id` is provided", - ), - }, + "invalid_run_type_error", + "RunType `local` is expected when a `dataset_id` is provided", + None, ))? } if d.as_ref().unwrap().len() == 0 { - Err(( + Err(error_response( StatusCode::BAD_REQUEST, - APIError { - code: String::from("dataset_empty_error"), - message: format!( - "Dataset `{}` has 0 record", - payload.dataset_id.as_ref().unwrap() - ), - }, + "dataset_empty_error", + &format!( + "Dataset `{}` has 0 record", + payload.dataset_id.as_ref().unwrap() + ), + None, ))? } @@ -651,12 +590,11 @@ async fn run_helper( if payload.inputs.is_some() { d = match dataset::Dataset::new_from_jsonl("inputs", payload.inputs.unwrap()).await { - Err(e) => Err(( + Err(e) => Err(error_response( StatusCode::BAD_REQUEST, - APIError { - code: String::from("invalid_inputs_error"), - message: e.to_string(), - }, + "invalid_inputs_error", + "Invalid inputs", + Some(e), ))?, Ok(d) => Some(d), }; @@ -671,12 +609,11 @@ async fn run_helper( .register_specification(&project, &app.hash(), &specification) .await { - Err(e) => Err(( + Err(e) => Err(error_response( StatusCode::INTERNAL_SERVER_ERROR, - APIError { - code: String::from("internal_server_error"), - message: format!("Failed to register specification: {}", e), - }, + "internal_server_error", + "Failed to register specification", + Some(e), ))?, Ok(_) => (), } @@ -692,12 +629,11 @@ async fn run_helper( ) .await { - Err(e) => Err(( + Err(e) => Err(error_response( StatusCode::INTERNAL_SERVER_ERROR, - APIError { - code: String::from("internal_server_error"), - message: format!("Failed prepare run: {}", e), - }, + "internal_server_error", + "Failed prepare run", + Some(e), ))?, Ok(()) => (), } @@ -738,13 +674,7 @@ async fn runs_create( }), ) } - Err((status_code, api_error)) => ( - status_code, - Json(APIResponse { - error: Some(api_error), - response: None, - }), - ), + Err(err) => err, } } @@ -798,13 +728,15 @@ async fn runs_create_stream( }); } Err((_, api_error)) => { - let _ = tx.send(json!({ - "type": "error", - "content": { - "code": api_error.code, - "message": api_error.message, - }, - })); + if let Some(error) = &api_error.0.error { + let _ = tx.send(json!({ + "type": "error", + "content": { + "code": error.code, + "message": error.message, + }, + })); + } } } @@ -849,15 +781,11 @@ async fn runs_list( .list_runs(&project, query.run_type, Some((query.limit, query.offset))) .await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to list runs: {}", e), - }), - response: 
None, - }), + "internal_server_error", + "Failed to list runs", + Some(e), ), Ok((runs, total)) => ( StatusCode::OK, @@ -886,15 +814,11 @@ async fn runs_retrieve_batch( ) -> (StatusCode, Json) { let project = project::Project::new_from_id(project_id); match state.store.load_runs(&project, payload.run_ids).await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to retrieve runs: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to retrieve runs", + Some(e), ), Ok(runs) => ( StatusCode::OK, @@ -914,26 +838,18 @@ async fn runs_retrieve( ) -> (StatusCode, Json) { let project = project::Project::new_from_id(project_id); match state.store.load_run(&project, &run_id, None).await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to retrieve run: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to retrieve run", + Some(e), ), Ok(run) => match run { - None => ( + None => error_response( StatusCode::NOT_FOUND, - Json(APIResponse { - error: Some(APIError { - code: String::from("run_not_found"), - message: format!("No run found for id `{}`", run_id), - }), - response: None, - }), + "run_not_found", + &format!("No run found for id `{}`", run_id), + None, ), Some(run) => ( StatusCode::OK, @@ -963,26 +879,18 @@ async fn runs_retrieve_block( .load_run(&project, &run_id, Some(Some((block_type, block_name)))) .await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to retrieve run: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to retrieve run", + Some(e), ), Ok(run) => match run { - None => ( + None => error_response( StatusCode::NOT_FOUND, - Json(APIResponse { - error: Some(APIError { - code: String::from("run_not_found"), - message: format!("No run found for id `{}`", run_id), - }), - response: None, - }), + "run_not_found", + &format!("No run found for id `{}`", run_id), + None, ), Some(run) => ( StatusCode::OK, @@ -1003,26 +911,18 @@ async fn runs_retrieve_status( ) -> (StatusCode, Json) { let project = project::Project::new_from_id(project_id); match state.store.load_run(&project, &run_id, Some(None)).await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to retrieve run: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to retrieve run", + Some(e), ), Ok(run) => match run { - None => ( + None => error_response( StatusCode::NOT_FOUND, - Json(APIResponse { - error: Some(APIError { - code: String::from("run_not_found"), - message: format!("No run found for id `{}`", run_id), - }), - response: None, - }), + "run_not_found", + &format!("No run found for id `{}`", run_id), + None, ), Some(run) => ( StatusCode::OK, @@ -1057,26 +957,18 @@ async fn data_sources_register( .setup(payload.credentials, state.qdrant_client.clone()) .await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: 
format!("Failed to register data source: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to register data source", + Some(e), ), Ok(()) => match state.store.register_data_source(&project, &ds).await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to register data source: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to register data source", + Some(e), ), Ok(()) => ( StatusCode::OK, @@ -1118,26 +1010,18 @@ async fn data_sources_search( .load_data_source(&project, &data_source_id) .await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to retrieve data source: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to retrieve data source", + Some(e), ), Ok(ds) => match ds { - None => ( + None => error_response( StatusCode::NOT_FOUND, - Json(APIResponse { - error: Some(APIError { - code: String::from("data_source_not_found"), - message: format!("No data source found for id `{}`", data_source_id), - }), - response: None, - }), + "data_source_not_found", + &format!("No data source found for id `{}`", data_source_id), + None, ), Some(ds) => match ds .search( @@ -1161,15 +1045,11 @@ async fn data_sources_search( })), }), ), - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to perform the search: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to perform the search", + Some(e), ), }, }, @@ -1202,35 +1082,62 @@ async fn data_sources_documents_update_tags( let remove_tags_set: HashSet = remove_tags.iter().cloned().collect(); if add_tags_set.intersection(&remove_tags_set).count() > 0 { - return ( + return error_response( StatusCode::BAD_REQUEST, - Json(APIResponse { - error: Some(APIError { - code: String::from("bad_request"), - message: String::from( - "The `add_tags` and `remove_tags` lists have a non-empty intersection.", - ), - }), - response: None, - }), + "bad_request", + "The `add_tags` and `remove_tags` lists have a non-empty intersection.", + None, ); } - - match load_data_source(&state, &project, &data_source_id).await { - Ok(ds) => match ds - .update_tags( - state.store.clone(), - state.qdrant_client.clone(), - document_id, - add_tags_set.into_iter().collect(), - remove_tags_set.into_iter().collect(), - ) - .await - { - Err(e) => server_error_response("Failed to update document tags", e), - Ok(_) => datasource_ok_response(ds), + match state + .store + .load_data_source(&project, &data_source_id) + .await + { + Err(e) => error_response( + StatusCode::INTERNAL_SERVER_ERROR, + "internal_server_error", + "Failed to retrieve data source", + Some(e), + ), + Ok(ds) => match ds { + None => error_response( + StatusCode::NOT_FOUND, + "data_source_not_found", + &format!("No data source found for id `{}`", data_source_id), + None, + ), + Some(ds) => match ds + .update_tags( + state.store.clone(), + state.qdrant_client.clone(), + document_id, + add_tags_set.into_iter().collect(), + remove_tags_set.into_iter().collect(), + ) + .await + { + Err(e) => error_response( + StatusCode::INTERNAL_SERVER_ERROR, + "internal_server_error", + "Failed to update document 
tags", + Some(e), + ), + Ok(_) => ( + StatusCode::OK, + Json(APIResponse { + error: None, + response: Some(json!({ + "data_source": { + "created": ds.created(), + "data_source_id": ds.data_source_id(), + "config": ds.config(), + }, + })), + }), + ), + }, }, - Err(api_error) => api_error, } } @@ -1248,87 +1155,78 @@ async fn data_sources_documents_update_parents( ) -> (StatusCode, Json) { let project = project::Project::new_from_id(project_id); - match load_data_source(&state, &project, &data_source_id).await { - Ok(ds) => match ds - .update_parents( - state.store.clone(), - state.qdrant_client.clone(), - document_id, - payload.parents, - ) - .await - { - Err(e) => server_error_response("Failed to update document parents", e), - Ok(_) => datasource_ok_response(ds), + match state + .store + .load_data_source(&project, &data_source_id) + .await + { + Err(e) => error_response( + StatusCode::INTERNAL_SERVER_ERROR, + "internal_server_error", + "Failed to retrieve data source", + Some(e), + ), + Ok(ds) => match ds { + None => error_response( + StatusCode::NOT_FOUND, + "data_source_not_found", + &format!("No data source found for id `{}`", data_source_id), + None, + ), + Some(ds) => match ds + .update_parents( + state.store.clone(), + state.qdrant_client.clone(), + document_id, + payload.parents, + ) + .await + { + Err(e) => error_response( + StatusCode::INTERNAL_SERVER_ERROR, + "internal_server_error", + "Failed to update document parents", + Some(e), + ), + Ok(_) => ( + StatusCode::OK, + Json(APIResponse { + error: None, + response: Some(json!({ + "data_source": { + "created": ds.created(), + "data_source_id": ds.data_source_id(), + "config": ds.config(), + }, + })), + }), + ), + }, }, - Err(api_error) => api_error, } } -fn server_error_response(message: &str, e: anyhow::Error) -> (StatusCode, Json) { +fn error_response( + status: StatusCode, + code: &str, + message: &str, + error: Option, +) -> (StatusCode, Json) { ( - StatusCode::INTERNAL_SERVER_ERROR, + status, Json(APIResponse { error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("{}: {}", message, e), + code: code.to_string(), + message: match error { + Some(err) => format!("{}\nError: {}", message, err), + None => message.to_string(), + }, }), response: None, }), ) } -fn datasource_ok_response(ds: DataSource) -> (StatusCode, Json) { - ( - StatusCode::OK, - Json(APIResponse { - error: None, - response: Some(json!({ - "data_source": { - "created": ds.created(), - "data_source_id": ds.data_source_id(), - "config": ds.config(), - }, - })), - }), - ) -} - -async fn load_data_source( - state: &Arc, - project: &Project, - data_source_id: &str, -) -> Result)> { - match state - .store - .load_data_source(&project, &data_source_id) - .await - { - Err(e) => Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to retrieve data source: {}", e), - }), - response: None, - }), - )), - Ok(ds) => match ds { - None => Err(( - StatusCode::NOT_FOUND, - Json(APIResponse { - error: Some(APIError { - code: String::from("data_source_not_found"), - message: format!("No data source found for id `{}`", data_source_id), - }), - response: None, - }), - )), - Some(ds) => Ok(ds), - }, - } -} // List versions of a document in a data source. 
#[derive(serde::Deserialize)] @@ -1356,15 +1254,11 @@ async fn data_sources_documents_versions_list( ) .await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to list document versions: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to list document versions", + Some(e), ), Ok((versions, total)) => ( StatusCode::OK, @@ -1410,26 +1304,18 @@ async fn data_sources_documents_upsert( .load_data_source(&project, &data_source_id) .await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to retrieve data source: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to retrieve data source", + Some(e), ), Ok(ds) => match ds { - None => ( + None => error_response( StatusCode::NOT_FOUND, - Json(APIResponse { - error: Some(APIError { - code: String::from("data_source_not_found"), - message: format!("No data source found for id `{}`", data_source_id), - }), - response: None, - }), + "data_source_not_found", + &format!("No data source found for id `{}`", data_source_id), + None, ), Some(ds) => { match ds @@ -1447,15 +1333,11 @@ async fn data_sources_documents_upsert( ) .await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to upsert document: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to upsert document", + Some(e), ), Ok(d) => { let mut response_data = json!({ @@ -1514,15 +1396,11 @@ async fn data_sources_documents_list( ) .await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to list data source: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to list data source", + Some(e), ), Ok((documents, total)) => ( StatusCode::OK, @@ -1556,50 +1434,34 @@ async fn data_sources_documents_retrieve( .load_data_source(&project, &data_source_id) .await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to retrieve data source: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to retrieve data source", + Some(e), ), Ok(ds) => match ds { - None => ( + None => error_response( StatusCode::NOT_FOUND, - Json(APIResponse { - error: Some(APIError { - code: String::from("data_source_not_found"), - message: format!("No data source found for id `{}`", data_source_id), - }), - response: None, - }), + "data_source_not_found", + &format!("No data source found for id `{}`", data_source_id), + None, ), Some(ds) => match ds .retrieve(state.store.clone(), &document_id, true, &query.version_hash) .await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to retrieve document: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to retrieve document", + Some(e), ), - Ok(None) => ( + Ok(None) => 
error_response( StatusCode::NOT_FOUND, - Json(APIResponse { - error: Some(APIError { - code: String::from("data_source_document_not_found"), - message: format!("No document found for id `{}`", document_id), - }), - response: None, - }), + "data_source_document_not_found", + &format!("No document found for id `{}`", document_id), + None, ), Ok(Some(d)) => ( StatusCode::OK, @@ -1632,26 +1494,18 @@ async fn data_sources_documents_delete( .load_data_source(&project, &data_source_id) .await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to retrieve data source: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to retrieve data source", + Some(e), ), Ok(ds) => match ds { - None => ( + None => error_response( StatusCode::NOT_FOUND, - Json(APIResponse { - error: Some(APIError { - code: String::from("data_source_not_found"), - message: format!("No data source found for id `{}`", data_source_id), - }), - response: None, - }), + "data_source_not_found", + &format!("No data source found for id `{}`", data_source_id), + None, ), Some(ds) => match ds .delete_document( @@ -1661,15 +1515,11 @@ async fn data_sources_documents_delete( ) .await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to delete document: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to delete document", + Some(e), ), Ok(_) => ( StatusCode::OK, @@ -1701,40 +1551,28 @@ async fn data_sources_delete( .load_data_source(&project, &data_source_id) .await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to retrieve data source: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to retrieve data source", + Some(e), ), Ok(ds) => match ds { - None => ( + None => error_response( StatusCode::NOT_FOUND, - Json(APIResponse { - error: Some(APIError { - code: String::from("data_source_not_found"), - message: format!("No data source found for id `{}`", data_source_id), - }), - response: None, - }), + "data_source_not_found", + &format!("No data source found for id `{}`", data_source_id), + None, ), Some(ds) => match ds .delete(state.store.clone(), state.qdrant_client.clone()) .await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to delete data source: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to delete data source", + Some(e), ), Ok(_) => ( StatusCode::OK, @@ -1768,15 +1606,11 @@ async fn tokenize( ) -> (StatusCode, Json) { let embedder = provider(payload.provider_id).embedder(payload.model_id); match embedder.encode(&payload.text).await { - Err(e) => ( + Err(e) => error_response( StatusCode::INTERNAL_SERVER_ERROR, - Json(APIResponse { - error: Some(APIError { - code: String::from("internal_server_error"), - message: format!("Failed to tokenize text: {}", e), - }), - response: None, - }), + "internal_server_error", + "Failed to tokenize text", + Some(e), ), Ok(tokens) => ( StatusCode::OK, diff --git a/core/src/data_sources/data_source.rs 
b/core/src/data_sources/data_source.rs index cba0c0101080..d54052b0e7a1 100644 --- a/core/src/data_sources/data_source.rs +++ b/core/src/data_sources/data_source.rs @@ -411,13 +411,8 @@ impl DataSource { ) .await?; - self.upsert_field_to_document_payload( - qdrant_client, - document_id, - "parents", - new_parents.clone(), - ) - .await?; + self.update_document_payload(qdrant_client, document_id, "parents", new_parents.clone()) + .await?; Ok(()) } @@ -438,12 +433,12 @@ impl DataSource { &remove_tags, ) .await?; - self.upsert_field_to_document_payload(qdrant_client, document_id, "tags", new_tags.clone()) + self.update_document_payload(qdrant_client, document_id, "tags", new_tags.clone()) .await?; Ok(new_tags) } - async fn upsert_field_to_document_payload( + async fn update_document_payload( &self, qdrant_client: Arc, document_id: String, diff --git a/core/src/stores/postgres.rs b/core/src/stores/postgres.rs index 7933108a0ccf..19dd6c5d22d9 100644 --- a/core/src/stores/postgres.rs +++ b/core/src/stores/postgres.rs @@ -1143,10 +1143,23 @@ impl Store for PostgresStore { ) -> Result<()> { let document_id = document_id.to_string(); let pool = self.pool.clone(); - let mut c = pool.get().await?; - let data_source_row_id = - get_data_source_row_id(project.project_id(), data_source_id.to_string(), &mut c) - .await?; + let c = pool.get().await?; + + let project_id = project.project_id(); + let data_source_id = data_source_id.to_string(); + + let r = c + .query( + "SELECT id FROM data_sources WHERE project = $1 AND data_source_id = $2 LIMIT 1", + &[&project_id, &data_source_id], + ) + .await?; + + let data_source_row_id: i64 = match r.len() { + 0 => Err(anyhow!("Unknown DataSource: {}", data_source_id))?, + 1 => r[0].get(0), + _ => unreachable!(), + }; c.execute( "UPDATE data_sources_documents SET parents = $1 \ @@ -1169,9 +1182,21 @@ impl Store for PostgresStore { let document_id = document_id.to_string(); let pool = self.pool.clone(); let mut c = pool.get().await?; - let data_source_row_id = - get_data_source_row_id(project.project_id(), data_source_id.to_string(), &mut c) - .await?; + let project_id = project.project_id(); + let data_source_id = data_source_id.to_string(); + + let r = c + .query( + "SELECT id FROM data_sources WHERE project = $1 AND data_source_id = $2 LIMIT 1", + &[&project_id, &data_source_id], + ) + .await?; + + let data_source_row_id: i64 = match r.len() { + 0 => Err(anyhow!("Unknown DataSource: {}", data_source_id))?, + 1 => r[0].get(0), + _ => unreachable!(), + }; let tx = c.transaction().await?; @@ -1985,22 +2010,3 @@ impl Store for PostgresStore { Box::new(self.clone()) } } - -async fn get_data_source_row_id( - project_id: i64, - data_source_id: String, - connection: &mut PooledConnection<'_, PostgresConnectionManager>, -) -> Result { - let r = connection - .query( - "SELECT id FROM data_sources WHERE project = $1 AND data_source_id = $2 LIMIT 1", - &[&project_id, &data_source_id], - ) - .await?; - - match r.len() { - 0 => Err(anyhow!("Unknown DataSource: {}", data_source_id))?, - 1 => Ok(r[0].get(0)), - _ => unreachable!(), - } -} From 876225afe342521d69ec3531987d710559d2b1b3 Mon Sep 17 00:00:00 2001 From: philipperolet Date: Thu, 31 Aug 2023 18:05:05 +0200 Subject: [PATCH 04/32] logging --- core/bin/dust_api.rs | 3 ++- core/src/stores/postgres.rs | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/core/bin/dust_api.rs b/core/bin/dust_api.rs index 98145eab399a..a78d8e62b3f9 100644 --- a/core/bin/dust_api.rs +++ b/core/bin/dust_api.rs @@ -1212,13 +1212,14 @@ 
fn error_response( message: &str, error: Option, ) -> (StatusCode, Json) { + utils::error(&format!("{}: {}\nError: {:?}", code, message, error)); ( status, Json(APIResponse { error: Some(APIError { code: code.to_string(), message: match error { - Some(err) => format!("{}\nError: {}", message, err), + Some(err) => format!("{}\nError: {:?}", message, err), None => message.to_string(), }, }), diff --git a/core/src/stores/postgres.rs b/core/src/stores/postgres.rs index 19dd6c5d22d9..03a3cece7cfd 100644 --- a/core/src/stores/postgres.rs +++ b/core/src/stores/postgres.rs @@ -13,7 +13,7 @@ use crate::stores::store::{Store, POSTGRES_TABLES, SQL_INDEXES}; use crate::utils; use anyhow::{anyhow, Result}; use async_trait::async_trait; -use bb8::{Pool, PooledConnection}; +use bb8::Pool; use bb8_postgres::PostgresConnectionManager; use serde_json::Value; use std::collections::{HashMap, HashSet}; From ca0e3f3f70c2e616b9e0840bc71e734deae6ed74 Mon Sep 17 00:00:00 2001 From: philipperolet Date: Fri, 1 Sep 2023 14:07:24 +0200 Subject: [PATCH 05/32] review 2 --- core/bin/dust_api.rs | 25 +++++++++++++++---------- core/src/data_sources/data_source.rs | 2 +- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/core/bin/dust_api.rs b/core/bin/dust_api.rs index a78d8e62b3f9..3234eb57eb80 100644 --- a/core/bin/dust_api.rs +++ b/core/bin/dust_api.rs @@ -728,15 +728,20 @@ async fn runs_create_stream( }); } Err((_, api_error)) => { - if let Some(error) = &api_error.0.error { - let _ = tx.send(json!({ - "type": "error", - "content": { - "code": error.code, - "message": error.message, - }, - })); - } + let error = match api_error.0.error { + Some(error) => error, + None => APIError { + code: "internal_server_error".to_string(), + message: "The app execution failed unexpectedly".to_string(), + }, + }; + let _ = tx.send(json!({ + "type": "error", + "content": { + "code": error.code, + "message": error.message, + }, + })); } } @@ -1219,7 +1224,7 @@ fn error_response( error: Some(APIError { code: code.to_string(), message: match error { - Some(err) => format!("{}\nError: {:?}", message, err), + Some(err) => format!("{} (error: {:?})", message, err), None => message.to_string(), }, }), diff --git a/core/src/data_sources/data_source.rs b/core/src/data_sources/data_source.rs index d54052b0e7a1..f055499af64b 100644 --- a/core/src/data_sources/data_source.rs +++ b/core/src/data_sources/data_source.rs @@ -449,7 +449,7 @@ impl DataSource { payload.insert(field_name, field_value.into()); let field_condition = qdrant::FieldCondition { - key: "document_id".to_string(), + key: "document_id_hash".to_string(), r#match: Some(qdrant::Match { match_value: Some(qdrant::r#match::MatchValue::Text(document_id)), }), From 406ca6fc1fad574c2d3b994fbee532a1c04e088b Mon Sep 17 00:00:00 2001 From: philipperolet Date: Fri, 1 Sep 2023 17:05:25 +0200 Subject: [PATCH 06/32] fixed qdrant payload update via document hash --- core/src/data_sources/data_source.rs | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/core/src/data_sources/data_source.rs b/core/src/data_sources/data_source.rs index f055499af64b..a7a620b8dd34 100644 --- a/core/src/data_sources/data_source.rs +++ b/core/src/data_sources/data_source.rs @@ -402,7 +402,7 @@ impl DataSource { document_id: String, parents: Vec, ) -> Result<()> { - let new_parents = store + store .update_data_source_document_parents( &self.project, &self.data_source_id(), @@ -411,7 +411,11 @@ impl DataSource { ) .await?; - self.update_document_payload(qdrant_client, 
document_id, "parents", new_parents.clone()) + let mut hasher = blake3::Hasher::new(); + hasher.update(document_id.as_bytes()); + let document_id_hash = format!("{}", hasher.finalize().to_hex()); + + self.update_document_payload(qdrant_client, document_id_hash, "parents", parents) .await?; Ok(()) } @@ -433,7 +437,12 @@ impl DataSource { &remove_tags, ) .await?; - self.update_document_payload(qdrant_client, document_id, "tags", new_tags.clone()) + + let mut hasher = blake3::Hasher::new(); + hasher.update(document_id.as_bytes()); + let document_id_hash = format!("{}", hasher.finalize().to_hex()); + + self.update_document_payload(qdrant_client, document_id_hash, "tags", new_tags.clone()) .await?; Ok(new_tags) } @@ -441,7 +450,7 @@ impl DataSource { async fn update_document_payload( &self, qdrant_client: Arc, - document_id: String, + document_id_hash: String, field_name: &str, field_value: impl Into, ) -> Result<()> { @@ -451,7 +460,7 @@ impl DataSource { let field_condition = qdrant::FieldCondition { key: "document_id_hash".to_string(), r#match: Some(qdrant::Match { - match_value: Some(qdrant::r#match::MatchValue::Text(document_id)), + match_value: Some(qdrant::r#match::MatchValue::Text(document_id_hash)), }), ..Default::default() }; From 730319012ab48001a684d11ffa75d28afe0732b8 Mon Sep 17 00:00:00 2001 From: philipperolet Date: Sat, 2 Sep 2023 18:41:57 +0200 Subject: [PATCH 07/32] WIP --- .../src/connectors/notion/lib/parents.ts | 86 +++++++++++++++++++ .../connectors/notion/temporal/activities.ts | 37 +++++++- 2 files changed, 119 insertions(+), 4 deletions(-) create mode 100644 connectors/src/connectors/notion/lib/parents.ts diff --git a/connectors/src/connectors/notion/lib/parents.ts b/connectors/src/connectors/notion/lib/parents.ts new file mode 100644 index 000000000000..08af7da06073 --- /dev/null +++ b/connectors/src/connectors/notion/lib/parents.ts @@ -0,0 +1,86 @@ +import { DataSourceInfo } from "@connectors/types/data_source_config"; +import { getNotionPageFromConnectorsDb } from "./connectors_db_helpers"; +import { NotionDatabase, NotionPage } from "@connectors/lib/models"; +import { get } from "http"; + +/** Compute the parents field for a notion document + * See the [Design Doc](TODO) and the field [documentation in core](TODO) for relevant details + */ +export async function getParents( + page: { + notionId: string; + parentType: string | null | undefined; + parentId: string | null | undefined; + }, + dataSourceInfo: DataSourceInfo +): Promise { + const parents: string[] = [page.notionId]; + switch (page.parentType) { + case "workspace": + return parents; + case "block": + // rare cases in which doing something here is useful + // are ignored for now, see the design doc for details + return parents; + case "page": + case "database": + // retrieve the parent from notion connectors db + // and add it to the parents array + let parent = await getNotionPageFromConnectorsDb( + dataSourceInfo, + page.parentId as string // (cannot be null here) + ); + if (!parent) { + // The parent is either not synced yet (not an issue, see design doc) or + // is not in Dust's scope, in both cases we can just return the page id + return parents; + } + return parents.concat( + await getParents( + { + notionId: parent.notionPageId, + parentType: parent.parentType, + parentId: parent.parentId, + }, + dataSourceInfo + ) + ); + default: + throw new Error(`Unhandled parent type ${page.parentType}`); + } +} + +export async function updateParentsField( + pageOrDb: NotionPage | NotionDatabase, + 
dataSourceInfo: DataSourceInfo, + parents?: string[] +) { + let notionId = + (pageOrDb as NotionPage).notionPageId || + (pageOrDb as NotionDatabase).notionDatabaseId; + + parents = await getParents( + { + notionId, + parentType: pageOrDb.parentType, + parentId: pageOrDb.parentId, + }, + dataSourceInfo + ); + // dbs are not in the Datasource so they don't have a parents field + // only notion pages need an update + (pageOrDb as NotionPage).notionPageId && + updateParentsFieldInDatasource(pageOrDb, parents); + for (const child of getChildren(pageOrDb)) { + await updateParentsField(child, dataSourceInfo, parents); + } +} + +function updateParentsFieldInDb( + pageOrDb: NotionPage | NotionDatabase, + parents: string[] +) {} + +function getChildren(pageOrDb: NotionPage | NotionDatabase) { + return pageOrDb.children; +} diff --git a/connectors/src/connectors/notion/temporal/activities.ts b/connectors/src/connectors/notion/temporal/activities.ts index 1fd7d0b3091e..53103085141c 100644 --- a/connectors/src/connectors/notion/temporal/activities.ts +++ b/connectors/src/connectors/notion/temporal/activities.ts @@ -7,6 +7,10 @@ import { upsertNotionDatabaseInConnectorsDb, upsertNotionPageInConnectorsDb, } from "@connectors/connectors/notion/lib/connectors_db_helpers"; +import { + getParents, + updateParentsField, +} from "@connectors/connectors/notion/lib/parents"; import { GARBAGE_COLLECT_MAX_DURATION_MS, isDuringGarbageCollectStartWindow, @@ -36,6 +40,7 @@ import { DataSourceConfig, DataSourceInfo, } from "@connectors/types/data_source_config"; +import { connect } from "http2"; const logger = mainLogger.child({ provider: "notion" }); @@ -240,9 +245,13 @@ export async function notionUpsertPageActivity( const parsedPage = await getParsedPage(accessToken, pageId, loggerArgs); + let needParentsUpdate = + parsedPage?.parentType !== notionPage?.parentType || + parsedPage?.parentId !== notionPage?.parentId; + if (parsedPage && parsedPage.rendered.length > MAX_DOCUMENT_TXT_LEN) { localLogger.info("Skipping page with too large body"); - await upsertNotionPageInConnectorsDb({ + const newNotionPage = await upsertNotionPageInConnectorsDb({ dataSourceInfo: dataSourceConfig, notionPageId: pageId, lastSeenTs: runTimestamp, @@ -253,12 +262,21 @@ export async function notionUpsertPageActivity( lastUpsertedTs: upsertTs, skipReason: "body_too_large", }); + needParentsUpdate && updateParentsField(newNotionPage); return; } if (parsedPage && parsedPage.hasBody) { upsertTs = new Date().getTime(); const documentId = `notion-${parsedPage.id}`; + let parents = await getParents( + { + notionId: pageId, + parentType: parsedPage.parentType, + parentId: parsedPage.parentId, + }, + dataSourceConfig + ); await upsertToDatasource({ dataSourceConfig, documentId, @@ -266,7 +284,7 @@ export async function notionUpsertPageActivity( documentUrl: parsedPage.url, timestampMs: parsedPage.updatedTime, tags: getTagsForPage(parsedPage), - parents: [], + parents, retries: 3, delayBetweenRetriesMs: 500, loggerArgs, @@ -279,7 +297,7 @@ export async function notionUpsertPageActivity( } localLogger.info("notionUpsertPageActivity: Upserting notion page in DB."); - await upsertNotionPageInConnectorsDb({ + const newNotionPage = await upsertNotionPageInConnectorsDb({ dataSourceInfo: dataSourceConfig, notionPageId: pageId, lastSeenTs: runTimestamp, @@ -289,6 +307,7 @@ export async function notionUpsertPageActivity( notionUrl: parsedPage ? 
parsedPage.url : null, lastUpsertedTs: upsertTs, }); + needParentsUpdate && updateParentsField(newNotionPage); } export async function notionUpsertDatabaseActivity( @@ -329,7 +348,11 @@ export async function notionUpsertDatabaseActivity( const parsedDb = await getParsedDatabase(accessToken, databaseId, loggerArgs); - await upsertNotionDatabaseInConnectorsDb({ + let needParentsUpdate = + parsedDb?.parentType !== notionDatabase?.parentType || + parsedDb?.parentId !== notionDatabase?.parentId; + + const newNotionDb = await upsertNotionDatabaseInConnectorsDb({ dataSourceInfo: dataSourceConfig, notionDatabaseId: databaseId, lastSeenTs: runTimestamp, @@ -338,6 +361,7 @@ export async function notionUpsertDatabaseActivity( title: parsedDb ? parsedDb.title : null, notionUrl: parsedDb ? parsedDb.url : null, }); + needParentsUpdate && updateParentsField(newNotionDb); } export async function saveSuccessSyncActivity( @@ -809,3 +833,8 @@ export async function garbageCollectActivity( lastGarbageCollectionFinishTime: new Date(), }); } + +/*export async function notionFillParentsActivity ( + accessToken: string, + +)*/ From cf71c42aec467bad1edeadb59db9cb84ffdbc377 Mon Sep 17 00:00:00 2001 From: philipperolet Date: Mon, 4 Sep 2023 08:34:58 +0200 Subject: [PATCH 08/32] wip2 --- connectors/src/connectors/notion/lib/parents.ts | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/connectors/src/connectors/notion/lib/parents.ts b/connectors/src/connectors/notion/lib/parents.ts index 08af7da06073..0c5854c08446 100644 --- a/connectors/src/connectors/notion/lib/parents.ts +++ b/connectors/src/connectors/notion/lib/parents.ts @@ -59,7 +59,7 @@ export async function updateParentsField( (pageOrDb as NotionPage).notionPageId || (pageOrDb as NotionDatabase).notionDatabaseId; - parents = await getParents( + parents = parents ? 
[notionId, ...parents] : await getParents( { notionId, parentType: pageOrDb.parentType, @@ -70,17 +70,16 @@ export async function updateParentsField( // dbs are not in the Datasource so they don't have a parents field // only notion pages need an update (pageOrDb as NotionPage).notionPageId && - updateParentsFieldInDatasource(pageOrDb, parents); + updateParentsFieldInDatasource(pageOrDb as NotionPage, parents); for (const child of getChildren(pageOrDb)) { await updateParentsField(child, dataSourceInfo, parents); } } -function updateParentsFieldInDb( - pageOrDb: NotionPage | NotionDatabase, +function updateParentsFieldInDatasource( + pageOrDb: NotionPage, parents: string[] ) {} function getChildren(pageOrDb: NotionPage | NotionDatabase) { - return pageOrDb.children; } From f2eff930d72f6eb7077875023e9c180fa2d6cd8e Mon Sep 17 00:00:00 2001 From: philipperolet Date: Tue, 5 Sep 2023 10:34:00 +0200 Subject: [PATCH 09/32] wip 3 --- .../notion/lib/connectors_db_helpers.ts | 62 ++++++++++ .../src/connectors/notion/lib/parents.ts | 32 ++--- .../connectors/notion/temporal/activities.ts | 39 +++---- .../connectors/notion/temporal/workflows.ts | 110 ++++++++++++++++-- 4 files changed, 193 insertions(+), 50 deletions(-) diff --git a/connectors/src/connectors/notion/lib/connectors_db_helpers.ts b/connectors/src/connectors/notion/lib/connectors_db_helpers.ts index 60e7ed73ce94..6234d4ec51f6 100644 --- a/connectors/src/connectors/notion/lib/connectors_db_helpers.ts +++ b/connectors/src/connectors/notion/lib/connectors_db_helpers.ts @@ -218,3 +218,65 @@ export async function getNotionDatabaseFromConnectorsDb( return NotionDatabase.findOne({ where }); } + +/** + * Get children *that are pages* of a given notion page or database + * + * !! Not children *of a page* + * @param dataSourceInfo + * @param notionId + * @returns + */ +export async function getPageChildrenOfDocument( + dataSourceInfo: DataSourceInfo, + notionId: string +): Promise { + const connector = await Connector.findOne({ + where: { + type: "notion", + workspaceId: dataSourceInfo.workspaceId, + dataSourceName: dataSourceInfo.dataSourceName, + }, + }); + if (!connector) { + throw new Error("Could not find connector"); + } + + return NotionPage.findAll({ + where: { + parentId: notionId, + connectorId: connector.id, + }, + }); +} + +/** + * Get children *that are databases* of a given notion page or database + * + * !! 
Not children *of a database* + * @param dataSourceInfo + * @param notionId + * @returns + */ +export async function getDatabaseChildrenOfDocument( + dataSourceInfo: DataSourceInfo, + notionId: string +): Promise { + const connector = await Connector.findOne({ + where: { + type: "notion", + workspaceId: dataSourceInfo.workspaceId, + dataSourceName: dataSourceInfo.dataSourceName, + }, + }); + if (!connector) { + throw new Error("Could not find connector"); + } + + return NotionDatabase.findAll({ + where: { + parentId: notionId, + connectorId: connector.id, + }, + }); +} diff --git a/connectors/src/connectors/notion/lib/parents.ts b/connectors/src/connectors/notion/lib/parents.ts index 0c5854c08446..a502311aab94 100644 --- a/connectors/src/connectors/notion/lib/parents.ts +++ b/connectors/src/connectors/notion/lib/parents.ts @@ -1,21 +1,20 @@ import { DataSourceInfo } from "@connectors/types/data_source_config"; import { getNotionPageFromConnectorsDb } from "./connectors_db_helpers"; import { NotionDatabase, NotionPage } from "@connectors/lib/models"; -import { get } from "http"; /** Compute the parents field for a notion document * See the [Design Doc](TODO) and the field [documentation in core](TODO) for relevant details */ export async function getParents( - page: { + document: { notionId: string; parentType: string | null | undefined; parentId: string | null | undefined; }, dataSourceInfo: DataSourceInfo ): Promise { - const parents: string[] = [page.notionId]; - switch (page.parentType) { + const parents: string[] = [document.notionId]; + switch (document.parentType) { case "workspace": return parents; case "block": @@ -28,7 +27,7 @@ export async function getParents( // and add it to the parents array let parent = await getNotionPageFromConnectorsDb( dataSourceInfo, - page.parentId as string // (cannot be null here) + document.parentId as string // (cannot be null here) ); if (!parent) { // The parent is either not synced yet (not an issue, see design doc) or @@ -46,7 +45,7 @@ export async function getParents( ) ); default: - throw new Error(`Unhandled parent type ${page.parentType}`); + throw new Error(`Unhandled parent type ${document.parentType}`); } } @@ -59,14 +58,16 @@ export async function updateParentsField( (pageOrDb as NotionPage).notionPageId || (pageOrDb as NotionDatabase).notionDatabaseId; - parents = parents ? [notionId, ...parents] : await getParents( - { - notionId, - parentType: pageOrDb.parentType, - parentId: pageOrDb.parentId, - }, - dataSourceInfo - ); + parents = parents + ? 
[notionId, ...parents] + : await getParents( + { + notionId, + parentType: pageOrDb.parentType, + parentId: pageOrDb.parentId, + }, + dataSourceInfo + ); // dbs are not in the Datasource so they don't have a parents field // only notion pages need an update (pageOrDb as NotionPage).notionPageId && @@ -81,5 +82,4 @@ function updateParentsFieldInDatasource( parents: string[] ) {} -function getChildren(pageOrDb: NotionPage | NotionDatabase) { -} +function getChildren(pageOrDb: NotionPage | NotionDatabase) {} diff --git a/connectors/src/connectors/notion/temporal/activities.ts b/connectors/src/connectors/notion/temporal/activities.ts index 53103085141c..473b75f9864d 100644 --- a/connectors/src/connectors/notion/temporal/activities.ts +++ b/connectors/src/connectors/notion/temporal/activities.ts @@ -7,10 +7,7 @@ import { upsertNotionDatabaseInConnectorsDb, upsertNotionPageInConnectorsDb, } from "@connectors/connectors/notion/lib/connectors_db_helpers"; -import { - getParents, - updateParentsField, -} from "@connectors/connectors/notion/lib/parents"; +import { getParents } from "@connectors/connectors/notion/lib/parents"; import { GARBAGE_COLLECT_MAX_DURATION_MS, isDuringGarbageCollectStartWindow, @@ -40,7 +37,6 @@ import { DataSourceConfig, DataSourceInfo, } from "@connectors/types/data_source_config"; -import { connect } from "http2"; const logger = mainLogger.child({ provider: "notion" }); @@ -209,6 +205,11 @@ export async function notionGetToSyncActivity( }; } +export type UpsertActivityResult = { + document: NotionPage | NotionDatabase | null; + createdOrMoved: boolean; +}; + export async function notionUpsertPageActivity( accessToken: string, pageId: string, @@ -216,7 +217,7 @@ export async function notionUpsertPageActivity( runTimestamp: number, loggerArgs: Record, isFullSync: boolean -) { +): Promise { const localLogger = logger.child({ ...loggerArgs, pageId }); const notionPage = await getNotionPageFromConnectorsDb( @@ -228,7 +229,7 @@ export async function notionUpsertPageActivity( if (alreadySeenInRun) { localLogger.info("Skipping page already seen in this run"); - return; + return { document: notionPage, createdOrMoved: false }; } const isSkipped = !!notionPage?.skipReason; @@ -238,14 +239,14 @@ export async function notionUpsertPageActivity( { skipReason: notionPage.skipReason }, "Skipping page with skip reason" ); - return; + return { document: notionPage, createdOrMoved: false }; } let upsertTs: number | undefined = undefined; const parsedPage = await getParsedPage(accessToken, pageId, loggerArgs); - let needParentsUpdate = + let createdOrMoved = parsedPage?.parentType !== notionPage?.parentType || parsedPage?.parentId !== notionPage?.parentId; @@ -262,8 +263,7 @@ export async function notionUpsertPageActivity( lastUpsertedTs: upsertTs, skipReason: "body_too_large", }); - needParentsUpdate && updateParentsField(newNotionPage); - return; + return { document: newNotionPage, createdOrMoved }; } if (parsedPage && parsedPage.hasBody) { @@ -307,7 +307,7 @@ export async function notionUpsertPageActivity( notionUrl: parsedPage ? 
parsedPage.url : null, lastUpsertedTs: upsertTs, }); - needParentsUpdate && updateParentsField(newNotionPage); + return { document: newNotionPage, createdOrMoved }; } export async function notionUpsertDatabaseActivity( @@ -316,7 +316,7 @@ export async function notionUpsertDatabaseActivity( dataSourceConfig: DataSourceConfig, runTimestamp: number, loggerArgs: Record -) { +): Promise { const localLogger = logger.child({ ...loggerArgs, databaseId }); const notionDatabase = await getNotionDatabaseFromConnectorsDb( @@ -329,7 +329,7 @@ export async function notionUpsertDatabaseActivity( if (alreadySeenInRun) { localLogger.info("Skipping database already seen in this run"); - return; + return { document: notionDatabase, createdOrMoved: false }; } const isSkipped = !!notionDatabase?.skipReason; @@ -339,7 +339,7 @@ export async function notionUpsertDatabaseActivity( { skipReason: notionDatabase.skipReason }, "Skipping database with skip reason" ); - return; + return { document: notionDatabase, createdOrMoved: false }; } localLogger.info( @@ -348,7 +348,7 @@ export async function notionUpsertDatabaseActivity( const parsedDb = await getParsedDatabase(accessToken, databaseId, loggerArgs); - let needParentsUpdate = + let createdOrMoved = parsedDb?.parentType !== notionDatabase?.parentType || parsedDb?.parentId !== notionDatabase?.parentId; @@ -361,7 +361,7 @@ export async function notionUpsertDatabaseActivity( title: parsedDb ? parsedDb.title : null, notionUrl: parsedDb ? parsedDb.url : null, }); - needParentsUpdate && updateParentsField(newNotionDb); + return { document: notionDatabase, createdOrMoved: createdOrMoved }; } export async function saveSuccessSyncActivity( @@ -833,8 +833,3 @@ export async function garbageCollectActivity( lastGarbageCollectionFinishTime: new Date(), }); } - -/*export async function notionFillParentsActivity ( - accessToken: string, - -)*/ diff --git a/connectors/src/connectors/notion/temporal/workflows.ts b/connectors/src/connectors/notion/temporal/workflows.ts index 60cbaa3ef178..4089e931481c 100644 --- a/connectors/src/connectors/notion/temporal/workflows.ts +++ b/connectors/src/connectors/notion/temporal/workflows.ts @@ -13,6 +13,15 @@ import type * as activities from "@connectors/connectors/notion/temporal/activit import { DataSourceConfig } from "@connectors/types/data_source_config"; import { getWorkflowId } from "./utils"; +import { UpsertActivityResult } from "@connectors/connectors/notion/temporal/activities"; +import { updateParentsField } from "../lib/parents"; +import { + getDatabaseChildrenOfDocument, + getNotionDatabaseFromConnectorsDb, + getNotionPageFromConnectorsDb, + getPageChildrenOfDocument, +} from "../lib/connectors_db_helpers"; +import { NotionDatabase, NotionPage } from "@connectors/lib/models"; const { garbageCollectActivity } = proxyActivities({ startToCloseTimeout: "120 minute", @@ -89,7 +98,7 @@ export async function notionSyncWorkflow( const childWorkflowQueue = new PQueue({ concurrency: MAX_CONCURRENT_CHILD_WORKFLOWS, }); - const promises: Promise[] = []; + const promises: Promise[] = []; do { const { pageIds, databaseIds, nextCursor } = @@ -170,7 +179,7 @@ export async function notionSyncWorkflow( parentClosePolicy: ParentClosePolicy.PARENT_CLOSE_POLICY_TERMINATE, }) - ) + ) as Promise ); } } @@ -204,14 +213,19 @@ export async function notionSyncWorkflow( parentClosePolicy: ParentClosePolicy.PARENT_CLOSE_POLICY_TERMINATE, }) - ) + ) as Promise ); } } } while (cursor); // wait for all child workflows to finish - await Promise.all(promises); + 
const syncWorkflowResults = await Promise.all(promises); + + await executeChild(updateParentsFieldsWorkflow, { + args: [dataSourceConfig, syncWorkflowResults], + parentClosePolicy: ParentClosePolicy.PARENT_CLOSE_POLICY_TERMINATE, + }); if (!isGargageCollectionRun) { await saveSuccessSyncActivity(dataSourceConfig); @@ -246,18 +260,22 @@ export async function notionSyncWorkflow( ); } +type SyncWorkflowResult = { + activitiesResults: UpsertActivityResult[]; +}; + export async function notionSyncResultPageWorkflow( dataSourceConfig: DataSourceConfig, notionAccessToken: string, pageIds: string[], runTimestamp: number, isBatchSync = false -) { +): Promise { const upsertQueue = new PQueue({ concurrency: MAX_PENDING_UPSERT_ACTIVITIES, }); - const promises: Promise[] = []; + const promises: Promise[] = []; for (const [pageIndex, pageId] of pageIds.entries()) { const loggerArgs = { @@ -275,11 +293,11 @@ export async function notionSyncResultPageWorkflow( loggerArgs, isBatchSync ) - ) + ) as Promise ); } - await Promise.all(promises); + return { activitiesResults: await Promise.all(promises) }; } export async function notionSyncResultPageDatabaseWorkflow( @@ -287,12 +305,12 @@ export async function notionSyncResultPageDatabaseWorkflow( notionAccessToken: string, databaseIds: string[], runTimestamp: number -) { +): Promise { const upsertQueue = new PQueue({ concurrency: MAX_PENDING_UPSERT_ACTIVITIES, }); - const promises: Promise[] = []; + const promises: Promise[] = []; for (const [databaseIndex, databaseId] of databaseIds.entries()) { const loggerArgs = { @@ -309,9 +327,77 @@ export async function notionSyncResultPageDatabaseWorkflow( runTimestamp, loggerArgs ) - ) + ) as Promise ); } + return { activitiesResults: await Promise.all(promises) }; +} + +export async function updateParentsFieldsWorkflow( + dataSourceConfig: DataSourceConfig, + syncWorkflowResults: SyncWorkflowResult[] +) { + // Get documents whose path changed (created or moved) If there is + // createdOrMoved, then the document cannot be null thus the cast is safe + const documents = syncWorkflowResults.flatMap((result) => + result.activitiesResults + .filter((aRes) => aRes.createdOrMoved) + .map((aRes) => aRes.document) + ) as (NotionPage | NotionDatabase)[]; + + /* Computing all descendants, then updating, ensures the field is updated only + once per page, limiting the load on the Datasource */ + const pagesToUpdate = await getPagesToUpdate(documents, dataSourceConfig); + + // Update everybody's parents field + for (const page of pagesToUpdate) { + await updateParentsFieldInDatasource(page); + handleErrors(); + } +} + +/** Get ids of all pages whose parents field should be updated: inital pages in + * documentIds, and all the descendants of documentIds that are pages (including + * children of databases) + * + * Note: databases are not stored in the Datasource, so they don't need to be + * updated + */ +async function getPagesToUpdate( + documents: (NotionPage | NotionDatabase)[], + dataSourceConfig: DataSourceConfig +): Promise { + const documentVisitedIds = new Set(documents); + + // documents is a queue of documents whose children should be fetched + while (documents.length !== 0) { + const document = documents.shift()!; + + // Get children of the document + const documentId = + (document as NotionPage).notionPageId || + (document as NotionDatabase).notionDatabaseId; + const pageChildren = await getPageChildrenOfDocument( + dataSourceConfig, + documentId + ); + const databaseChildren = await getDatabaseChildrenOfDocument( + 
dataSourceConfig, + documentId + ); + + // If they haven't yet been visited, add them to documents visited + // and to the list of documents whose children should be fetched + for (const child of [...pageChildren, ...databaseChildren]) { + if (!documentVisitedIds.has(child)) { + documentVisitedIds.add(child); + documents.push(child); + } + } + } - await Promise.all(promises); + // only return pages since databases are not updated + return Array.from(documentVisitedIds).filter( + (d) => (d as NotionPage).notionPageId + ) as NotionPage[]; } From 9a30d02d27e371a683f3edce6a4c7d4fcacb7c28 Mon Sep 17 00:00:00 2001 From: philipperolet Date: Tue, 5 Sep 2023 15:13:47 +0200 Subject: [PATCH 10/32] wip 4 --- .../src/connectors/notion/lib/parents.ts | 128 ++++++++++++------ .../connectors/notion/temporal/activities.ts | 30 ++-- .../connectors/notion/temporal/workflows.ts | 96 ++----------- connectors/src/lib/data_sources.ts | 44 ++++++ 4 files changed, 163 insertions(+), 135 deletions(-) diff --git a/connectors/src/connectors/notion/lib/parents.ts b/connectors/src/connectors/notion/lib/parents.ts index a502311aab94..0d510c973fde 100644 --- a/connectors/src/connectors/notion/lib/parents.ts +++ b/connectors/src/connectors/notion/lib/parents.ts @@ -1,17 +1,26 @@ -import { DataSourceInfo } from "@connectors/types/data_source_config"; -import { getNotionPageFromConnectorsDb } from "./connectors_db_helpers"; +import { + DataSourceConfig, + DataSourceInfo, +} from "@connectors/types/data_source_config"; +import { + getDatabaseChildrenOfDocument, + getNotionPageFromConnectorsDb, + getPageChildrenOfDocument, +} from "./connectors_db_helpers"; import { NotionDatabase, NotionPage } from "@connectors/lib/models"; +import memoize from "lodash.memoize"; +import { updateDocumentParentsField } from "@connectors/lib/data_sources"; /** Compute the parents field for a notion document * See the [Design Doc](TODO) and the field [documentation in core](TODO) for relevant details */ -export async function getParents( +async function _getParents( + dataSourceInfo: DataSourceInfo, document: { notionId: string; parentType: string | null | undefined; parentId: string | null | undefined; - }, - dataSourceInfo: DataSourceInfo + } ): Promise { const parents: string[] = [document.notionId]; switch (document.parentType) { @@ -35,51 +44,88 @@ export async function getParents( return parents; } return parents.concat( - await getParents( - { - notionId: parent.notionPageId, - parentType: parent.parentType, - parentId: parent.parentId, - }, - dataSourceInfo - ) + await getParents(dataSourceInfo, { + notionId: parent.notionPageId, + parentType: parent.parentType, + parentId: parent.parentId, + }) ); default: throw new Error(`Unhandled parent type ${document.parentType}`); } } -export async function updateParentsField( - pageOrDb: NotionPage | NotionDatabase, - dataSourceInfo: DataSourceInfo, - parents?: string[] +export const getParents = memoize(_getParents, (dataSourceInfo, document) => { + return `${dataSourceInfo.dataSourceName}:${document.notionId}`; +}); + +export async function updateAllParentsFields( + dataSourceConfig: DataSourceConfig, + documents: (NotionPage | NotionDatabase)[] ) { - let notionId = - (pageOrDb as NotionPage).notionPageId || - (pageOrDb as NotionDatabase).notionDatabaseId; + /* Computing all descendants, then updating, ensures the field is updated only + once per page, limiting the load on the Datasource */ + const pagesToUpdate = await getPagesToUpdate(documents, dataSourceConfig); - parents = parents - ? 
[notionId, ...parents] - : await getParents( - { - notionId, - parentType: pageOrDb.parentType, - parentId: pageOrDb.parentId, - }, - dataSourceInfo - ); - // dbs are not in the Datasource so they don't have a parents field - // only notion pages need an update - (pageOrDb as NotionPage).notionPageId && - updateParentsFieldInDatasource(pageOrDb as NotionPage, parents); - for (const child of getChildren(pageOrDb)) { - await updateParentsField(child, dataSourceInfo, parents); + // Update everybody's parents field + for (const page of pagesToUpdate) { + const parents = await getParents(dataSourceConfig, { + notionId: page.notionPageId, + parentType: page.parentType, + parentId: page.parentId, + }); + + await updateDocumentParentsField( + dataSourceConfig, + `notion-${page.notionPageId}`, + parents + ); + // TODO how to handle errors here } } -function updateParentsFieldInDatasource( - pageOrDb: NotionPage, - parents: string[] -) {} +/** Get ids of all pages whose parents field should be updated: inital pages in + * documentIds, and all the descendants of documentIds that are pages (including + * children of databases) + * + * Note: databases are not stored in the Datasource, so they don't need to be + * updated + */ +async function getPagesToUpdate( + documents: (NotionPage | NotionDatabase)[], + dataSourceConfig: DataSourceConfig +): Promise { + const documentVisitedIds = new Set(documents); -function getChildren(pageOrDb: NotionPage | NotionDatabase) {} + // documents is a queue of documents whose children should be fetched + while (documents.length !== 0) { + const document = documents.shift()!; + + // Get children of the document + const documentId = + (document as NotionPage).notionPageId || + (document as NotionDatabase).notionDatabaseId; + const pageChildren = await getPageChildrenOfDocument( + dataSourceConfig, + documentId + ); + const databaseChildren = await getDatabaseChildrenOfDocument( + dataSourceConfig, + documentId + ); + + // If they haven't yet been visited, add them to documents visited + // and to the list of documents whose children should be fetched + for (const child of [...pageChildren, ...databaseChildren]) { + if (!documentVisitedIds.has(child)) { + documentVisitedIds.add(child); + documents.push(child); + } + } + } + + // only return pages since databases are not updated + return Array.from(documentVisitedIds).filter( + (d) => (d as NotionPage).notionPageId + ) as NotionPage[]; +} diff --git a/connectors/src/connectors/notion/temporal/activities.ts b/connectors/src/connectors/notion/temporal/activities.ts index 473b75f9864d..08fce178e418 100644 --- a/connectors/src/connectors/notion/temporal/activities.ts +++ b/connectors/src/connectors/notion/temporal/activities.ts @@ -7,7 +7,10 @@ import { upsertNotionDatabaseInConnectorsDb, upsertNotionPageInConnectorsDb, } from "@connectors/connectors/notion/lib/connectors_db_helpers"; -import { getParents } from "@connectors/connectors/notion/lib/parents"; +import { + getParents, + updateAllParentsFields, +} from "@connectors/connectors/notion/lib/parents"; import { GARBAGE_COLLECT_MAX_DURATION_MS, isDuringGarbageCollectStartWindow, @@ -269,14 +272,11 @@ export async function notionUpsertPageActivity( if (parsedPage && parsedPage.hasBody) { upsertTs = new Date().getTime(); const documentId = `notion-${parsedPage.id}`; - let parents = await getParents( - { - notionId: pageId, - parentType: parsedPage.parentType, - parentId: parsedPage.parentId, - }, - dataSourceConfig - ); + let parents = await getParents(dataSourceConfig, { + 
notionId: pageId, + parentType: parsedPage.parentType, + parentId: parsedPage.parentId, + }); await upsertToDatasource({ dataSourceConfig, documentId, @@ -833,3 +833,15 @@ export async function garbageCollectActivity( lastGarbageCollectionFinishTime: new Date(), }); } + +export async function updateParentsFieldsActivity( + dataSourceConfig: DataSourceConfig, + activitiesResults: UpsertActivityResult[] +) { + // Get documents whose path changed (created or moved) If there is + // createdOrMoved, then the document cannot be null thus the cast is safe + const documents = activitiesResults + .filter((aRes) => aRes.createdOrMoved) + .map((aRes) => aRes.document) as (NotionPage | NotionDatabase)[]; + await updateAllParentsFields(dataSourceConfig, documents); +} diff --git a/connectors/src/connectors/notion/temporal/workflows.ts b/connectors/src/connectors/notion/temporal/workflows.ts index 4089e931481c..15465b4a6083 100644 --- a/connectors/src/connectors/notion/temporal/workflows.ts +++ b/connectors/src/connectors/notion/temporal/workflows.ts @@ -14,14 +14,6 @@ import { DataSourceConfig } from "@connectors/types/data_source_config"; import { getWorkflowId } from "./utils"; import { UpsertActivityResult } from "@connectors/connectors/notion/temporal/activities"; -import { updateParentsField } from "../lib/parents"; -import { - getDatabaseChildrenOfDocument, - getNotionDatabaseFromConnectorsDb, - getNotionPageFromConnectorsDb, - getPageChildrenOfDocument, -} from "../lib/connectors_db_helpers"; -import { NotionDatabase, NotionPage } from "@connectors/lib/models"; const { garbageCollectActivity } = proxyActivities({ startToCloseTimeout: "120 minute", @@ -32,10 +24,13 @@ const { notionUpsertPageActivity, notionUpsertDatabaseActivity } = startToCloseTimeout: "60 minute", }); -const { notionGetToSyncActivity, syncGarbageCollectorActivity } = - proxyActivities({ - startToCloseTimeout: "10 minute", - }); +const { + notionGetToSyncActivity, + syncGarbageCollectorActivity, + updateParentsFieldsActivity, +} = proxyActivities({ + startToCloseTimeout: "10 minute", +}); const { saveSuccessSyncActivity, @@ -222,10 +217,10 @@ export async function notionSyncWorkflow( // wait for all child workflows to finish const syncWorkflowResults = await Promise.all(promises); - await executeChild(updateParentsFieldsWorkflow, { - args: [dataSourceConfig, syncWorkflowResults], - parentClosePolicy: ParentClosePolicy.PARENT_CLOSE_POLICY_TERMINATE, - }); + await updateParentsFieldsActivity( + dataSourceConfig, + syncWorkflowResults.flatMap((r) => r.activitiesResults) + ); if (!isGargageCollectionRun) { await saveSuccessSyncActivity(dataSourceConfig); @@ -332,72 +327,3 @@ export async function notionSyncResultPageDatabaseWorkflow( } return { activitiesResults: await Promise.all(promises) }; } - -export async function updateParentsFieldsWorkflow( - dataSourceConfig: DataSourceConfig, - syncWorkflowResults: SyncWorkflowResult[] -) { - // Get documents whose path changed (created or moved) If there is - // createdOrMoved, then the document cannot be null thus the cast is safe - const documents = syncWorkflowResults.flatMap((result) => - result.activitiesResults - .filter((aRes) => aRes.createdOrMoved) - .map((aRes) => aRes.document) - ) as (NotionPage | NotionDatabase)[]; - - /* Computing all descendants, then updating, ensures the field is updated only - once per page, limiting the load on the Datasource */ - const pagesToUpdate = await getPagesToUpdate(documents, dataSourceConfig); - - // Update everybody's parents field - for 
(const page of pagesToUpdate) {
-    await updateParentsFieldInDatasource(page);
-    handleErrors();
-  }
-}
-
-/** Get ids of all pages whose parents field should be updated: inital pages in
- * documentIds, and all the descendants of documentIds that are pages (including
- * children of databases)
- *
- * Note: databases are not stored in the Datasource, so they don't need to be
- * updated
- */
-async function getPagesToUpdate(
-  documents: (NotionPage | NotionDatabase)[],
-  dataSourceConfig: DataSourceConfig
-): Promise {
-  const documentVisitedIds = new Set(documents);
-
-  // documents is a queue of documents whose children should be fetched
-  while (documents.length !== 0) {
-    const document = documents.shift()!;
-
-    // Get children of the document
-    const documentId =
-      (document as NotionPage).notionPageId ||
-      (document as NotionDatabase).notionDatabaseId;
-    const pageChildren = await getPageChildrenOfDocument(
-      dataSourceConfig,
-      documentId
-    );
-    const databaseChildren = await getDatabaseChildrenOfDocument(
-      dataSourceConfig,
-      documentId
-    );
-
-    // If they haven't yet been visited, add them to documents visited
-    // and to the list of documents whose children should be fetched
-    for (const child of [...pageChildren, ...databaseChildren]) {
-      if (!documentVisitedIds.has(child)) {
-        documentVisitedIds.add(child);
-        documents.push(child);
-      }
-    }
-  }
-
-  // only return pages since databases are not updated
-  return Array.from(documentVisitedIds).filter(
-    (d) => (d as NotionPage).notionPageId
-  ) as NotionPage[];
-}
diff --git a/connectors/src/lib/data_sources.ts b/connectors/src/lib/data_sources.ts
index 47c9e83127d3..83660bc798f4 100644
--- a/connectors/src/lib/data_sources.ts
+++ b/connectors/src/lib/data_sources.ts
@@ -195,3 +195,47 @@ export async function deleteFromDataSource(
     throw new Error(`Error deleting from dust: ${dustRequestResult}`);
   }
 }
+
+export async function updateDocumentParentsField(
+  dataSourceConfig: DataSourceConfig,
+  documentId: string,
+  parents: string[],
+  loggerArgs: Record = {}
+) {
+  const localLogger = logger.child({ ...loggerArgs, documentId });
+  const urlSafeName = encodeURIComponent(dataSourceConfig.dataSourceName);
+  const endpoint = `${FRONT_API}/api/v1/w/${dataSourceConfig.workspaceId}/data_sources/${urlSafeName}/documents/${documentId}/parents`;
+  const dustRequestConfig: AxiosRequestConfig = {
+    headers: {
+      Authorization: `Bearer ${dataSourceConfig.workspaceAPIKey}`,
+    },
+  };
+
+  let dustRequestResult: AxiosResponse;
+  try {
+    dustRequestResult = await axios.patch(
+      endpoint,
+      {
+        parents: parents,
+      },
+      dustRequestConfig
+    );
+  } catch (e) {
+    throw e;
+  }
+
+  if (dustRequestResult.status >= 200 && dustRequestResult.status < 300) {
+    return;
+  } else {
+    localLogger.error(
+      {
+        status: dustRequestResult.status,
+        data: dustRequestResult.data,
+      },
+      "Error updating document parents field."
+    );
+    throw new Error(
+      `Error updating document parents field: ${dustRequestResult}`
+    );
+  }
+}
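
A note on the Set used by getPagesToUpdate above: a JavaScript Set compares
objects by reference, not by value, so a model instance fetched later for the
same Notion page never matches the instance already stored in the set. This is
why a later commit in this series ("removed js Set in parents (semantics don't
work)") replaces the Set with an explicit id-based check. A minimal sketch of
the pitfall, using hypothetical objects:

    // Sketch only: Set membership is reference identity, not structural equality.
    const a = { notionPageId: "page-1" };
    const b = { notionPageId: "page-1" }; // same id, distinct object
    const visited = new Set([a]);
    console.log(visited.has(b)); // false: b is never deduplicated
    // Keying the set on the id gives the intended semantics:
    const visitedIds = new Set([a.notionPageId]);
    console.log(visitedIds.has(b.notionPageId)); // true
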
From 2c7efc6c386cfc7618ea42c7bacdcb97bdd9d30b Mon Sep 17 00:00:00 2001
From: philipperolet
Date: Tue, 5 Sep 2023 16:07:34 +0200
Subject: [PATCH 11/32] add parents endpoint to front

---
 front/lib/core_api.ts                         |  32 ++++++
 .../[name]/documents/[documentId]/parents.ts  | 108 ++++++++++++++++++
 2 files changed, 140 insertions(+)
 create mode 100644 front/pages/api/v1/w/[wId]/data_sources/[name]/documents/[documentId]/parents.ts

diff --git a/front/lib/core_api.ts b/front/lib/core_api.ts
index 20f060ffc0a1..e598fd043823 100644
--- a/front/lib/core_api.ts
+++ b/front/lib/core_api.ts
@@ -696,6 +696,38 @@ export const CoreAPI = {
 
     return _resultFromResponse(response);
   },
+
+  async updateDataSourceDocumentParents({
+    projectId,
+    dataSourceName,
+    documentId,
+    parents,
+  }: {
+    projectId: string;
+    dataSourceName: string;
+    documentId: string;
+    parents: string[];
+  }): Promise<
+    CoreAPIResponse<{
+      data_source: CoreAPIDataSource;
+    }>
+  > {
+    const response = await fetch(
+      `${CORE_API}/projects/${projectId}/data_sources/${dataSourceName}/documents/${encodeURIComponent(
+        documentId
+      )}/parents`,
+      {
+        method: "PATCH",
+        headers: {
+          "Content-Type": "application/json",
+        },
+        body: JSON.stringify({
+          parents: parents,
+        }),
+      }
+    );
+
+    return _resultFromResponse(response);
+  },
 
   async deleteDataSourceDocument({
     projectId,
diff --git a/front/pages/api/v1/w/[wId]/data_sources/[name]/documents/[documentId]/parents.ts b/front/pages/api/v1/w/[wId]/data_sources/[name]/documents/[documentId]/parents.ts
new file mode 100644
index 000000000000..7713a2331eee
--- /dev/null
+++ b/front/pages/api/v1/w/[wId]/data_sources/[name]/documents/[documentId]/parents.ts
@@ -0,0 +1,108 @@
+import { NextApiRequest, NextApiResponse } from "next";
+
+import { getDataSource } from "@app/lib/api/data_sources";
+import { Authenticator, getAPIKey } from "@app/lib/auth";
+import { CoreAPI } from "@app/lib/core_api";
+import { ReturnedAPIErrorType } from "@app/lib/error";
+import { apiError, withLogging } from "@app/logger/withlogging";
+
+export type PostParentsResponseBody = {
+  updated: true;
+};
+
+async function handler(
+  req: NextApiRequest,
+  res: NextApiResponse
+): Promise {
+  const keyRes = await getAPIKey(req);
+  if (keyRes.isErr()) {
+    return apiError(req, res, keyRes.error);
+  }
+  const { auth } = await Authenticator.fromKey(
+    keyRes.value,
+    req.query.wId as string
+  );
+
+  const owner = auth.workspace();
+  if (!owner) {
+    return apiError(req, res, {
+      status_code: 404,
+      api_error: {
+        type: "data_source_not_found",
+        message: "The data source you requested was not found.",
+      },
+    });
+  }
+
+  const dataSource = await getDataSource(auth, req.query.name as string);
+
+  if (!dataSource) {
+    return apiError(req, res, {
+      status_code: 404,
+      api_error: {
+        type: "data_source_not_found",
+        message: "The data source you requested was not found.",
+      },
+    });
+  }
+
+  switch (req.method) {
+    case "POST":
+      if (!auth.isBuilder()) {
+        return apiError(req, res, {
+          status_code: 403,
+          api_error: {
+            type: "data_source_auth_error",
+            message:
+              "You can only alter the data sources of the workspaces for which you are a builder.",
+          },
+        });
+      }
+
+      if (
+        !req.body ||
+        !Array.isArray(req.body.parents) ||
+        !req.body.parents.every((p: any) => typeof p === "string")
+      ) {
+        return apiError(req, res, {
+          status_code: 400,
+          api_error: {
+            type: "invalid_request_error",
+            message: "Invalid request body, `parents` (string[]) is required.",
+          },
+        });
+      }
+
+      const updateRes = await CoreAPI.updateDataSourceDocumentParents({
+        projectId: dataSource.dustAPIProjectId,
+        dataSourceName: dataSource.name,
+        documentId: req.query.documentId as string,
+        parents: req.body.parents,
+      });
+
+      if (updateRes.isErr()) {
+        return apiError(req, res, {
+          status_code: 500,
+          api_error: {
+            type: "internal_server_error",
+            message: "There was an error updating the document's parents.",
+            data_source_error: updateRes.error,
+          },
+        });
+      }
+
+      res.status(200).json({ updated: true });
+      return;
+
+    default:
+      return apiError(req, res, {
+        status_code: 405,
+        api_error: {
+          type: "method_not_supported_error",
+          message: "The method passed is not supported, POST is expected.",
+        },
+      });
+  }
+}
+
+export default withLogging(handler);

From 5628ee98daf5f686cbbfd8a2f36cc8295780ae29 Mon Sep 17 00:00:00 2001
From: philipperolet
Date: Tue, 5 Sep 2023 16:18:18 +0200
Subject: [PATCH 12/32] cleaning

---
 connectors/src/connectors/notion/lib/parents.ts   | 15 +++++++-------
 .../src/connectors/notion/temporal/activities.ts  | 16 ++++++++--------
 .../src/connectors/notion/temporal/workflows.ts   |  2 +-
 connectors/src/lib/data_sources.ts                |  1 +
 4 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/connectors/src/connectors/notion/lib/parents.ts b/connectors/src/connectors/notion/lib/parents.ts
index 0d510c973fde..10f9d3dd34fc 100644
--- a/connectors/src/connectors/notion/lib/parents.ts
+++ b/connectors/src/connectors/notion/lib/parents.ts
@@ -1,15 +1,17 @@
+import memoize from "lodash.memoize";
+
+import { updateDocumentParentsField } from "@connectors/lib/data_sources";
+import { NotionDatabase, NotionPage } from "@connectors/lib/models";
 import {
   DataSourceConfig,
   DataSourceInfo,
 } from "@connectors/types/data_source_config";
 
+import {
+  getDatabaseChildrenOfDocument,
+  getNotionPageFromConnectorsDb,
+  getPageChildrenOfDocument,
+} from 
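
Note that the route above only answers POST (its switch handles the "POST"
case and the fallback error says "POST is expected"), while
updateDocumentParentsField in the previous commit issues axios.patch; a later
"cleaning" commit in this series switches the connector helper to axios.post.
A hedged sketch of how a client could exercise the new route — the base URL,
workspace id, data source name and API key below are placeholders:

    // Sketch only, assuming a workspace API key with builder rights.
    const res = await fetch(
      "https://dust.tt/api/v1/w/WORKSPACE_ID/data_sources/DATA_SOURCE_NAME/documents/notion-abc/parents",
      {
        method: "POST",
        headers: {
          Authorization: `Bearer ${process.env.DUST_API_KEY}`,
          "Content-Type": "application/json",
        },
        // Following the parents convention, the document's own id comes first,
        // then its ancestors ordered closest parent first.
        body: JSON.stringify({
          parents: ["notion-abc", "notion-parent", "notion-grandparent"],
        }),
      }
    );
    const body = await res.json(); // expected on success: { updated: true }
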
"@connectors/connectors/notion/lib/connectors_db_helpers"; -import { - getParents, - updateAllParentsFields, -} from "@connectors/connectors/notion/lib/parents"; import { GARBAGE_COLLECT_MAX_DURATION_MS, isDuringGarbageCollectStartWindow, @@ -21,6 +17,10 @@ import { getParsedPage, isAccessibleAndUnarchived, } from "@connectors/connectors/notion/lib/notion_api"; +import { + getParents, + updateAllParentsFields, +} from "@connectors/connectors/notion/lib/parents"; import { getTagsForPage } from "@connectors/connectors/notion/lib/tags"; import { deleteFromDataSource, @@ -249,7 +249,7 @@ export async function notionUpsertPageActivity( const parsedPage = await getParsedPage(accessToken, pageId, loggerArgs); - let createdOrMoved = + const createdOrMoved = parsedPage?.parentType !== notionPage?.parentType || parsedPage?.parentId !== notionPage?.parentId; @@ -272,7 +272,7 @@ export async function notionUpsertPageActivity( if (parsedPage && parsedPage.hasBody) { upsertTs = new Date().getTime(); const documentId = `notion-${parsedPage.id}`; - let parents = await getParents(dataSourceConfig, { + const parents = await getParents(dataSourceConfig, { notionId: pageId, parentType: parsedPage.parentType, parentId: parsedPage.parentId, @@ -348,7 +348,7 @@ export async function notionUpsertDatabaseActivity( const parsedDb = await getParsedDatabase(accessToken, databaseId, loggerArgs); - let createdOrMoved = + const createdOrMoved = parsedDb?.parentType !== notionDatabase?.parentType || parsedDb?.parentId !== notionDatabase?.parentId; @@ -361,7 +361,7 @@ export async function notionUpsertDatabaseActivity( title: parsedDb ? parsedDb.title : null, notionUrl: parsedDb ? parsedDb.url : null, }); - return { document: notionDatabase, createdOrMoved: createdOrMoved }; + return { document: newNotionDb, createdOrMoved: createdOrMoved }; } export async function saveSuccessSyncActivity( diff --git a/connectors/src/connectors/notion/temporal/workflows.ts b/connectors/src/connectors/notion/temporal/workflows.ts index 15465b4a6083..3b6793156d51 100644 --- a/connectors/src/connectors/notion/temporal/workflows.ts +++ b/connectors/src/connectors/notion/temporal/workflows.ts @@ -10,10 +10,10 @@ import { import PQueue from "p-queue"; import type * as activities from "@connectors/connectors/notion/temporal/activities"; +import { UpsertActivityResult } from "@connectors/connectors/notion/temporal/activities"; import { DataSourceConfig } from "@connectors/types/data_source_config"; import { getWorkflowId } from "./utils"; -import { UpsertActivityResult } from "@connectors/connectors/notion/temporal/activities"; const { garbageCollectActivity } = proxyActivities({ startToCloseTimeout: "120 minute", diff --git a/connectors/src/lib/data_sources.ts b/connectors/src/lib/data_sources.ts index 83660bc798f4..b44a7129ed99 100644 --- a/connectors/src/lib/data_sources.ts +++ b/connectors/src/lib/data_sources.ts @@ -221,6 +221,7 @@ export async function updateDocumentParentsField( dustRequestConfig ); } catch (e) { + localLogger.error({ error: e }, "Error updating document parents field."); throw e; } From 120e2de8b1eacb6cbb1c22be70a2268bb59554ec Mon Sep 17 00:00:00 2001 From: philipperolet Date: Tue, 5 Sep 2023 18:16:29 +0200 Subject: [PATCH 13/32] removed js Set in parents (semantics don't work) --- .../src/connectors/notion/lib/parents.ts | 38 +++++++++++-------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/connectors/src/connectors/notion/lib/parents.ts b/connectors/src/connectors/notion/lib/parents.ts index 
10f9d3dd34fc..38921be3b9be 100644 --- a/connectors/src/connectors/notion/lib/parents.ts +++ b/connectors/src/connectors/notion/lib/parents.ts @@ -98,37 +98,45 @@ async function getPagesToUpdate( documents: (NotionPage | NotionDatabase)[], dataSourceConfig: DataSourceConfig ): Promise { - const documentVisitedIds = new Set(documents); + const pagesToUpdate: NotionPage[] = []; - // documents is a queue of documents whose children should be fetched - while (documents.length !== 0) { - const document = documents.shift(); + let i = 0; + while (i < documents.length) { + // Visit next document and if it's a page add it to update list + const document = documents[i++]!; + let documentId = notionId(document); + if (document instanceof NotionPage) { + pagesToUpdate.push(document); + } // Get children of the document - const documentId = - (document as NotionPage).notionPageId || - (document as NotionDatabase).notionDatabaseId; const pageChildren = await getPageChildrenOfDocument( dataSourceConfig, - documentId + documentId! ); const databaseChildren = await getDatabaseChildrenOfDocument( dataSourceConfig, - documentId + documentId! ); // If they haven't yet been visited, add them to documents visited // and to the list of documents whose children should be fetched for (const child of [...pageChildren, ...databaseChildren]) { - if (!documentVisitedIds.has(child)) { - documentVisitedIds.add(child); + if (!documents.some((d) => notionId(d) === notionId(child))) { documents.push(child); } } } - // only return pages since databases are not updated - return Array.from(documentVisitedIds).filter( - (d) => (d as NotionPage).notionPageId - ) as NotionPage[]; + return pagesToUpdate; +} + +function notionId(document: NotionPage | NotionDatabase): string { + if (document instanceof NotionPage) { + return document.notionPageId; + } else if (document instanceof NotionDatabase) { + return document.notionDatabaseId; + } else { + throw new Error("Unexpected error"); + } } From b8a3091a1eda2a31bee41d716a28fcc080b88685 Mon Sep 17 00:00:00 2001 From: philipperolet Date: Tue, 5 Sep 2023 18:34:13 +0200 Subject: [PATCH 14/32] cleaning --- connectors/src/connectors/notion/lib/parents.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/connectors/src/connectors/notion/lib/parents.ts b/connectors/src/connectors/notion/lib/parents.ts index 38921be3b9be..b2719935ec5d 100644 --- a/connectors/src/connectors/notion/lib/parents.ts +++ b/connectors/src/connectors/notion/lib/parents.ts @@ -13,8 +13,9 @@ import { getPageChildrenOfDocument, } from "./connectors_db_helpers"; -/** Compute the parents field for a notion document - * See the [Design Doc](TODO) and the field [documentation in core](TODO) for relevant details +/** Compute the parents field for a notion document See the [Design + * Doc](https://www.notion.so/dust-tt/Engineering-e0f834b5be5a43569baaf76e9c41adf2?p=3d26536a4e0a464eae0c3f8f27a7af97&pm=s) + * and the field documentation [in core]() for relevant details */ async function _getParents( dataSourceInfo: DataSourceInfo, @@ -83,7 +84,6 @@ export async function updateAllParentsFields( `notion-${page.notionPageId}`, parents ); - // TODO how to handle errors here } } From 2462e982c41268452f1b06f71430a4c32efb94a0 Mon Sep 17 00:00:00 2001 From: philipperolet Date: Wed, 6 Sep 2023 09:58:42 +0200 Subject: [PATCH 15/32] remove SyncWorkflowResult --- .../connectors/notion/temporal/workflows.ts | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git 
a/connectors/src/connectors/notion/temporal/workflows.ts b/connectors/src/connectors/notion/temporal/workflows.ts index 3b6793156d51..8eb9aeb22ed7 100644 --- a/connectors/src/connectors/notion/temporal/workflows.ts +++ b/connectors/src/connectors/notion/temporal/workflows.ts @@ -93,7 +93,7 @@ export async function notionSyncWorkflow( const childWorkflowQueue = new PQueue({ concurrency: MAX_CONCURRENT_CHILD_WORKFLOWS, }); - const promises: Promise[] = []; + const promises: Promise[] = []; do { const { pageIds, databaseIds, nextCursor } = @@ -174,7 +174,7 @@ export async function notionSyncWorkflow( parentClosePolicy: ParentClosePolicy.PARENT_CLOSE_POLICY_TERMINATE, }) - ) as Promise + ) as Promise ); } } @@ -208,7 +208,7 @@ export async function notionSyncWorkflow( parentClosePolicy: ParentClosePolicy.PARENT_CLOSE_POLICY_TERMINATE, }) - ) as Promise + ) as Promise ); } } @@ -219,7 +219,7 @@ export async function notionSyncWorkflow( await updateParentsFieldsActivity( dataSourceConfig, - syncWorkflowResults.flatMap((r) => r.activitiesResults) + syncWorkflowResults.flat() ); if (!isGargageCollectionRun) { @@ -255,17 +255,13 @@ export async function notionSyncWorkflow( ); } -type SyncWorkflowResult = { - activitiesResults: UpsertActivityResult[]; -}; - export async function notionSyncResultPageWorkflow( dataSourceConfig: DataSourceConfig, notionAccessToken: string, pageIds: string[], runTimestamp: number, isBatchSync = false -): Promise { +): Promise { const upsertQueue = new PQueue({ concurrency: MAX_PENDING_UPSERT_ACTIVITIES, }); @@ -292,7 +288,7 @@ export async function notionSyncResultPageWorkflow( ); } - return { activitiesResults: await Promise.all(promises) }; + return await Promise.all(promises); } export async function notionSyncResultPageDatabaseWorkflow( @@ -300,7 +296,7 @@ export async function notionSyncResultPageDatabaseWorkflow( notionAccessToken: string, databaseIds: string[], runTimestamp: number -): Promise { +): Promise { const upsertQueue = new PQueue({ concurrency: MAX_PENDING_UPSERT_ACTIVITIES, }); @@ -325,5 +321,5 @@ export async function notionSyncResultPageDatabaseWorkflow( ) as Promise ); } - return { activitiesResults: await Promise.all(promises) }; + return await Promise.all(promises); } From a350434609dca4eea74c5e5c79c44b4fa039d6a3 Mon Sep 17 00:00:00 2001 From: philipperolet Date: Wed, 6 Sep 2023 10:23:52 +0200 Subject: [PATCH 16/32] controlled memoization of getParents --- .../src/connectors/notion/lib/parents.ts | 41 +++++++++++++------ .../connectors/notion/temporal/activities.ts | 14 ++++--- 2 files changed, 38 insertions(+), 17 deletions(-) diff --git a/connectors/src/connectors/notion/lib/parents.ts b/connectors/src/connectors/notion/lib/parents.ts index b2719935ec5d..94050ccb339e 100644 --- a/connectors/src/connectors/notion/lib/parents.ts +++ b/connectors/src/connectors/notion/lib/parents.ts @@ -12,10 +12,16 @@ import { getNotionPageFromConnectorsDb, getPageChildrenOfDocument, } from "./connectors_db_helpers"; +import { uuid4 } from "@temporalio/workflow"; /** Compute the parents field for a notion document See the [Design * Doc](https://www.notion.so/dust-tt/Engineering-e0f834b5be5a43569baaf76e9c41adf2?p=3d26536a4e0a464eae0c3f8f27a7af97&pm=s) - * and the field documentation [in core]() for relevant details + * and the field documentation [in + * core](https://github.com/dust-tt/dust/blob/main/core/src/data_sources/data_source.rs) + * for relevant details + * + * @param memoizationKey optional key to control memoization of this function + * */ async 
function _getParents( dataSourceInfo: DataSourceInfo, @@ -23,7 +29,8 @@ async function _getParents( notionId: string; parentType: string | null | undefined; parentId: string | null | undefined; - } + }, + memoizationKey?: string ): Promise { const parents: string[] = [document.notionId]; switch (document.parentType) { @@ -59,9 +66,12 @@ async function _getParents( } } -export const getParents = memoize(_getParents, (dataSourceInfo, document) => { - return `${dataSourceInfo.dataSourceName}:${document.notionId}`; -}); +export const getParents = memoize( + _getParents, + (dataSourceInfo, document, memoizationKey) => { + return `${dataSourceInfo.dataSourceName}:${document.notionId}:${memoizationKey}`; + } +); export async function updateAllParentsFields( dataSourceConfig: DataSourceConfig, @@ -71,13 +81,20 @@ export async function updateAllParentsFields( once per page, limiting the load on the Datasource */ const pagesToUpdate = await getPagesToUpdate(documents, dataSourceConfig); - // Update everybody's parents field + // Update everybody's parents field. Use of a memoization key to avoid + // potentially sharing memoization across updateAllParentsFields calls, which + // would be incorrect + let memoizationKey = uuid4(); for (const page of pagesToUpdate) { - const parents = await getParents(dataSourceConfig, { - notionId: page.notionPageId, - parentType: page.parentType, - parentId: page.parentId, - }); + const parents = await getParents( + dataSourceConfig, + { + notionId: page.notionPageId, + parentType: page.parentType, + parentId: page.parentId, + }, + memoizationKey + ); await updateDocumentParentsField( dataSourceConfig, @@ -87,7 +104,7 @@ export async function updateAllParentsFields( } } -/** Get ids of all pages whose parents field should be updated: inital pages in +/** Get ids of all pages whose parents field should be updated: initial pages in * documentIds, and all the descendants of documentIds that are pages (including * children of databases) * diff --git a/connectors/src/connectors/notion/temporal/activities.ts b/connectors/src/connectors/notion/temporal/activities.ts index ba5d5e41ce72..8d4942dcfa91 100644 --- a/connectors/src/connectors/notion/temporal/activities.ts +++ b/connectors/src/connectors/notion/temporal/activities.ts @@ -272,11 +272,15 @@ export async function notionUpsertPageActivity( if (parsedPage && parsedPage.hasBody) { upsertTs = new Date().getTime(); const documentId = `notion-${parsedPage.id}`; - const parents = await getParents(dataSourceConfig, { - notionId: pageId, - parentType: parsedPage.parentType, - parentId: parsedPage.parentId, - }); + const parents = await getParents( + dataSourceConfig, + { + notionId: pageId, + parentType: parsedPage.parentType, + parentId: parsedPage.parentId, + }, + runTimestamp.toString() // memoize at notionSyncWorkflow main inner loop level + ); await upsertToDatasource({ dataSourceConfig, documentId, From 458bc7c949d1578b2cd91713a21fcc1f57c03dd3 Mon Sep 17 00:00:00 2001 From: philipperolet Date: Wed, 6 Sep 2023 10:48:20 +0200 Subject: [PATCH 17/32] cleaning --- connectors/src/connectors/notion/lib/parents.ts | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/connectors/src/connectors/notion/lib/parents.ts b/connectors/src/connectors/notion/lib/parents.ts index 94050ccb339e..73589f122469 100644 --- a/connectors/src/connectors/notion/lib/parents.ts +++ b/connectors/src/connectors/notion/lib/parents.ts @@ -1,3 +1,4 @@ +import { uuid4 } from "@temporalio/workflow"; import memoize from 
"lodash.memoize"; import { updateDocumentParentsField } from "@connectors/lib/data_sources"; @@ -12,7 +13,6 @@ import { getNotionPageFromConnectorsDb, getPageChildrenOfDocument, } from "./connectors_db_helpers"; -import { uuid4 } from "@temporalio/workflow"; /** Compute the parents field for a notion document See the [Design * Doc](https://www.notion.so/dust-tt/Engineering-e0f834b5be5a43569baaf76e9c41adf2?p=3d26536a4e0a464eae0c3f8f27a7af97&pm=s) @@ -20,7 +20,7 @@ import { uuid4 } from "@temporalio/workflow"; * core](https://github.com/dust-tt/dust/blob/main/core/src/data_sources/data_source.rs) * for relevant details * - * @param memoizationKey optional key to control memoization of this function + * @param memoizationKey optional key to control memoization of this function (not actually used by the functio) * */ async function _getParents( @@ -30,6 +30,7 @@ async function _getParents( parentType: string | null | undefined; parentId: string | null | undefined; }, + // eslint-disable-next-line @typescript-eslint/no-unused-vars -- used for memoization memoizationKey?: string ): Promise { const parents: string[] = [document.notionId]; @@ -84,7 +85,7 @@ export async function updateAllParentsFields( // Update everybody's parents field. Use of a memoization key to avoid // potentially sharing memoization across updateAllParentsFields calls, which // would be incorrect - let memoizationKey = uuid4(); + const memoizationKey = uuid4(); for (const page of pagesToUpdate) { const parents = await getParents( dataSourceConfig, @@ -120,8 +121,8 @@ async function getPagesToUpdate( let i = 0; while (i < documents.length) { // Visit next document and if it's a page add it to update list - const document = documents[i++]!; - let documentId = notionId(document); + const document = documents[i++] as NotionPage | NotionDatabase; + const documentId = notionId(document); if (document instanceof NotionPage) { pagesToUpdate.push(document); } @@ -129,11 +130,11 @@ async function getPagesToUpdate( // Get children of the document const pageChildren = await getPageChildrenOfDocument( dataSourceConfig, - documentId! + documentId ); const databaseChildren = await getDatabaseChildrenOfDocument( dataSourceConfig, - documentId! 
+ documentId ); // If they haven't yet been visited, add them to documents visited From aaec7797b50ad4e3193b72ad65e65dfeef614008 Mon Sep 17 00:00:00 2001 From: philipperolet Date: Wed, 6 Sep 2023 11:42:02 +0200 Subject: [PATCH 18/32] handle void returns from pqueue execution in notion workflow --- .../connectors/notion/temporal/workflows.ts | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/connectors/src/connectors/notion/temporal/workflows.ts b/connectors/src/connectors/notion/temporal/workflows.ts index 8eb9aeb22ed7..7e46b3ae70b3 100644 --- a/connectors/src/connectors/notion/temporal/workflows.ts +++ b/connectors/src/connectors/notion/temporal/workflows.ts @@ -93,7 +93,7 @@ export async function notionSyncWorkflow( const childWorkflowQueue = new PQueue({ concurrency: MAX_CONCURRENT_CHILD_WORKFLOWS, }); - const promises: Promise[] = []; + const promises: Promise<(UpsertActivityResult | void)[] | void>[] = []; do { const { pageIds, databaseIds, nextCursor } = @@ -174,7 +174,7 @@ export async function notionSyncWorkflow( parentClosePolicy: ParentClosePolicy.PARENT_CLOSE_POLICY_TERMINATE, }) - ) as Promise + ) ); } } @@ -208,7 +208,7 @@ export async function notionSyncWorkflow( parentClosePolicy: ParentClosePolicy.PARENT_CLOSE_POLICY_TERMINATE, }) - ) as Promise + ) ); } } @@ -217,9 +217,10 @@ export async function notionSyncWorkflow( // wait for all child workflows to finish const syncWorkflowResults = await Promise.all(promises); + // remove potential void results indicating execution timeouts await updateParentsFieldsActivity( dataSourceConfig, - syncWorkflowResults.flat() + syncWorkflowResults.flat().filter((r) => r) as UpsertActivityResult[] ); if (!isGargageCollectionRun) { @@ -261,12 +262,12 @@ export async function notionSyncResultPageWorkflow( pageIds: string[], runTimestamp: number, isBatchSync = false -): Promise { +): Promise<(UpsertActivityResult | void)[]> { const upsertQueue = new PQueue({ concurrency: MAX_PENDING_UPSERT_ACTIVITIES, }); - const promises: Promise[] = []; + const promises: Promise[] = []; for (const [pageIndex, pageId] of pageIds.entries()) { const loggerArgs = { @@ -284,7 +285,7 @@ export async function notionSyncResultPageWorkflow( loggerArgs, isBatchSync ) - ) as Promise + ) ); } @@ -296,12 +297,12 @@ export async function notionSyncResultPageDatabaseWorkflow( notionAccessToken: string, databaseIds: string[], runTimestamp: number -): Promise { +): Promise<(UpsertActivityResult | void)[]> { const upsertQueue = new PQueue({ concurrency: MAX_PENDING_UPSERT_ACTIVITIES, }); - const promises: Promise[] = []; + const promises: Promise[] = []; for (const [databaseIndex, databaseId] of databaseIds.entries()) { const loggerArgs = { @@ -318,7 +319,7 @@ export async function notionSyncResultPageDatabaseWorkflow( runTimestamp, loggerArgs ) - ) as Promise + ) ); } return await Promise.all(promises); From 0acb00023311db239e682497116e9663c939182c Mon Sep 17 00:00:00 2001 From: philipperolet Date: Wed, 6 Sep 2023 12:27:58 +0200 Subject: [PATCH 19/32] fix: document is of class object at runtime, not notionpage or notiondb --- connectors/src/connectors/notion/lib/parents.ts | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/connectors/src/connectors/notion/lib/parents.ts b/connectors/src/connectors/notion/lib/parents.ts index 73589f122469..c5436dbeed0f 100644 --- a/connectors/src/connectors/notion/lib/parents.ts +++ b/connectors/src/connectors/notion/lib/parents.ts @@ -123,8 +123,8 @@ async function 
getPagesToUpdate( // Visit next document and if it's a page add it to update list const document = documents[i++] as NotionPage | NotionDatabase; const documentId = notionId(document); - if (document instanceof NotionPage) { - pagesToUpdate.push(document); + if ((document as NotionPage).notionPageId) { + pagesToUpdate.push(document as NotionPage); } // Get children of the document @@ -150,11 +150,8 @@ async function getPagesToUpdate( } function notionId(document: NotionPage | NotionDatabase): string { - if (document instanceof NotionPage) { - return document.notionPageId; - } else if (document instanceof NotionDatabase) { - return document.notionDatabaseId; - } else { - throw new Error("Unexpected error"); - } + return ( + (document as NotionPage).notionPageId || + (document as NotionDatabase).notionDatabaseId + ); } From ed66a6a4b6b0deec47bf5f168f0e7f936e3337c4 Mon Sep 17 00:00:00 2001 From: philipperolet Date: Wed, 6 Sep 2023 13:27:00 +0200 Subject: [PATCH 20/32] cleaning --- connectors/src/lib/data_sources.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/connectors/src/lib/data_sources.ts b/connectors/src/lib/data_sources.ts index b44a7129ed99..3dfee2fab783 100644 --- a/connectors/src/lib/data_sources.ts +++ b/connectors/src/lib/data_sources.ts @@ -213,7 +213,7 @@ export async function updateDocumentParentsField( let dustRequestResult: AxiosResponse; try { - dustRequestResult = await axios.patch( + dustRequestResult = await axios.post( endpoint, { parents: parents, From 51ffbb18fec464f87f427ee1946d6b0944f5dd2d Mon Sep 17 00:00:00 2001 From: philipperolet Date: Wed, 6 Sep 2023 13:27:05 +0200 Subject: [PATCH 21/32] bump notion workflow version --- connectors/src/connectors/notion/temporal/config.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/connectors/src/connectors/notion/temporal/config.ts b/connectors/src/connectors/notion/temporal/config.ts index 63c2e4dbd69c..516aa88be278 100644 --- a/connectors/src/connectors/notion/temporal/config.ts +++ b/connectors/src/connectors/notion/temporal/config.ts @@ -1,2 +1,2 @@ -export const WORKFLOW_VERSION = 17; +export const WORKFLOW_VERSION = 18; export const QUEUE_NAME = `notion-queue-v${WORKFLOW_VERSION}`; From 7c000fec66a8238bf51c995e4eaabdc1cfafef68 Mon Sep 17 00:00:00 2001 From: philipperolet Date: Wed, 6 Sep 2023 14:41:38 +0200 Subject: [PATCH 22/32] pass timestamp to activity execution for better memoization --- connectors/src/connectors/notion/lib/parents.ts | 10 +++++----- .../src/connectors/notion/temporal/activities.ts | 9 +++++++-- connectors/src/connectors/notion/temporal/workflows.ts | 3 ++- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/connectors/src/connectors/notion/lib/parents.ts b/connectors/src/connectors/notion/lib/parents.ts index c5436dbeed0f..65edf07de375 100644 --- a/connectors/src/connectors/notion/lib/parents.ts +++ b/connectors/src/connectors/notion/lib/parents.ts @@ -76,16 +76,16 @@ export const getParents = memoize( export async function updateAllParentsFields( dataSourceConfig: DataSourceConfig, - documents: (NotionPage | NotionDatabase)[] + documents: (NotionPage | NotionDatabase)[], + memoizationKey?: string ) { /* Computing all descendants, then updating, ensures the field is updated only once per page, limiting the load on the Datasource */ const pagesToUpdate = await getPagesToUpdate(documents, dataSourceConfig); - // Update everybody's parents field. 
Use of a memoization key to avoid - // potentially sharing memoization across updateAllParentsFields calls, which - // would be incorrect - const memoizationKey = uuid4(); + // Update everybody's parents field. Use of a memoization key to control + // sharing memoization across updateAllParentsFields calls, which + // can be desired or not depending on the use case for (const page of pagesToUpdate) { const parents = await getParents( dataSourceConfig, diff --git a/connectors/src/connectors/notion/temporal/activities.ts b/connectors/src/connectors/notion/temporal/activities.ts index 8d4942dcfa91..99f7b86bed5d 100644 --- a/connectors/src/connectors/notion/temporal/activities.ts +++ b/connectors/src/connectors/notion/temporal/activities.ts @@ -840,12 +840,17 @@ export async function garbageCollectActivity( export async function updateParentsFieldsActivity( dataSourceConfig: DataSourceConfig, - activitiesResults: UpsertActivityResult[] + activitiesResults: UpsertActivityResult[], + activityExecutionTimestamp: number ) { // Get documents whose path changed (created or moved) If there is // createdOrMoved, then the document cannot be null thus the cast is safe const documents = activitiesResults .filter((aRes) => aRes.createdOrMoved) .map((aRes) => aRes.document) as (NotionPage | NotionDatabase)[]; - await updateAllParentsFields(dataSourceConfig, documents); + await updateAllParentsFields( + dataSourceConfig, + documents, + activityExecutionTimestamp.toString() + ); } diff --git a/connectors/src/connectors/notion/temporal/workflows.ts b/connectors/src/connectors/notion/temporal/workflows.ts index 7e46b3ae70b3..42b932ccd363 100644 --- a/connectors/src/connectors/notion/temporal/workflows.ts +++ b/connectors/src/connectors/notion/temporal/workflows.ts @@ -220,7 +220,8 @@ export async function notionSyncWorkflow( // remove potential void results indicating execution timeouts await updateParentsFieldsActivity( dataSourceConfig, - syncWorkflowResults.flat().filter((r) => r) as UpsertActivityResult[] + syncWorkflowResults.flat().filter((r) => r) as UpsertActivityResult[], + runTimestamp ); if (!isGargageCollectionRun) { From 83bdaa494f6e76efb927ea5ad45889a5eadc33b3 Mon Sep 17 00:00:00 2001 From: philipperolet Date: Wed, 6 Sep 2023 15:00:17 +0200 Subject: [PATCH 23/32] henry's suggestions --- .../notion/lib/connectors_db_helpers.ts | 10 +-- .../src/connectors/notion/lib/parents.ts | 67 +++++++++---------- .../connectors/notion/temporal/activities.ts | 18 ++--- 3 files changed, 43 insertions(+), 52 deletions(-) diff --git a/connectors/src/connectors/notion/lib/connectors_db_helpers.ts b/connectors/src/connectors/notion/lib/connectors_db_helpers.ts index 6234d4ec51f6..31b776f9423b 100644 --- a/connectors/src/connectors/notion/lib/connectors_db_helpers.ts +++ b/connectors/src/connectors/notion/lib/connectors_db_helpers.ts @@ -223,11 +223,8 @@ export async function getNotionDatabaseFromConnectorsDb( * Get children *that are pages* of a given notion page or database * * !! Not children *of a page* - * @param dataSourceInfo - * @param notionId - * @returns */ -export async function getPageChildrenOfDocument( +export async function getPageChildrenOf( dataSourceInfo: DataSourceInfo, notionId: string ): Promise { @@ -254,11 +251,8 @@ export async function getPageChildrenOfDocument( * Get children *that are databases* of a given notion page or database * * !! 
Not children *of a database* - * @param dataSourceInfo - * @param notionId - * @returns */ -export async function getDatabaseChildrenOfDocument( +export async function getDatabaseChildrenOf( dataSourceInfo: DataSourceInfo, notionId: string ): Promise { diff --git a/connectors/src/connectors/notion/lib/parents.ts b/connectors/src/connectors/notion/lib/parents.ts index 65edf07de375..0eaa7e8ace41 100644 --- a/connectors/src/connectors/notion/lib/parents.ts +++ b/connectors/src/connectors/notion/lib/parents.ts @@ -1,6 +1,10 @@ -import { uuid4 } from "@temporalio/workflow"; import memoize from "lodash.memoize"; +import { + getDatabaseChildrenOf, + getNotionPageFromConnectorsDb, + getPageChildrenOf, +} from "@connectors/connectors/notion/lib/connectors_db_helpers"; import { updateDocumentParentsField } from "@connectors/lib/data_sources"; import { NotionDatabase, NotionPage } from "@connectors/lib/models"; import { @@ -8,13 +12,7 @@ import { DataSourceInfo, } from "@connectors/types/data_source_config"; -import { - getDatabaseChildrenOfDocument, - getNotionPageFromConnectorsDb, - getPageChildrenOfDocument, -} from "./connectors_db_helpers"; - -/** Compute the parents field for a notion document See the [Design +/** Compute the parents field for a notion pageOrDb See the [Design * Doc](https://www.notion.so/dust-tt/Engineering-e0f834b5be5a43569baaf76e9c41adf2?p=3d26536a4e0a464eae0c3f8f27a7af97&pm=s) * and the field documentation [in * core](https://github.com/dust-tt/dust/blob/main/core/src/data_sources/data_source.rs) @@ -25,7 +23,7 @@ import { */ async function _getParents( dataSourceInfo: DataSourceInfo, - document: { + pageOrDb: { notionId: string; parentType: string | null | undefined; parentId: string | null | undefined; @@ -33,8 +31,8 @@ async function _getParents( // eslint-disable-next-line @typescript-eslint/no-unused-vars -- used for memoization memoizationKey?: string ): Promise { - const parents: string[] = [document.notionId]; - switch (document.parentType) { + const parents: string[] = [pageOrDb.notionId]; + switch (pageOrDb.parentType) { case "workspace": return parents; case "block": @@ -47,7 +45,7 @@ async function _getParents( // and add it to the parents array const parent = await getNotionPageFromConnectorsDb( dataSourceInfo, - document.parentId as string // (cannot be null here) + pageOrDb.parentId as string // (cannot be null here) ); if (!parent) { // The parent is either not synced yet (not an issue, see design doc) or @@ -63,25 +61,25 @@ async function _getParents( ); } default: - throw new Error(`Unhandled parent type ${document.parentType}`); + throw new Error(`Unhandled parent type ${pageOrDb.parentType}`); } } export const getParents = memoize( _getParents, - (dataSourceInfo, document, memoizationKey) => { - return `${dataSourceInfo.dataSourceName}:${document.notionId}:${memoizationKey}`; + (dataSourceInfo, pageOrDb, memoizationKey) => { + return `${dataSourceInfo.dataSourceName}:${pageOrDb.notionId}:${memoizationKey}`; } ); export async function updateAllParentsFields( dataSourceConfig: DataSourceConfig, - documents: (NotionPage | NotionDatabase)[], + pageOrDbs: (NotionPage | NotionDatabase)[], memoizationKey?: string ) { /* Computing all descendants, then updating, ensures the field is updated only once per page, limiting the load on the Datasource */ - const pagesToUpdate = await getPagesToUpdate(documents, dataSourceConfig); + const pagesToUpdate = await getPagesToUpdate(pageOrDbs, dataSourceConfig); // Update everybody's parents field. 
Use of a memoization key to control // sharing memoization across updateAllParentsFields calls, which @@ -106,42 +104,41 @@ export async function updateAllParentsFields( } /** Get ids of all pages whose parents field should be updated: initial pages in - * documentIds, and all the descendants of documentIds that are pages (including + * pageOrDbs, and all the descendants of pageOrDbs that are pages (including * children of databases) * * Note: databases are not stored in the Datasource, so they don't need to be * updated */ async function getPagesToUpdate( - documents: (NotionPage | NotionDatabase)[], + pageOrDbs: (NotionPage | NotionDatabase)[], dataSourceConfig: DataSourceConfig ): Promise { const pagesToUpdate: NotionPage[] = []; let i = 0; - while (i < documents.length) { + while (i < pageOrDbs.length) { // Visit next document and if it's a page add it to update list - const document = documents[i++] as NotionPage | NotionDatabase; - const documentId = notionId(document); - if ((document as NotionPage).notionPageId) { - pagesToUpdate.push(document as NotionPage); + const pageOrDb = pageOrDbs[i++] as NotionPage | NotionDatabase; + const pageOrDbId = notionPageOrDbId(pageOrDb); + if ((pageOrDb as NotionPage).notionPageId) { + pagesToUpdate.push(pageOrDb as NotionPage); } // Get children of the document - const pageChildren = await getPageChildrenOfDocument( - dataSourceConfig, - documentId - ); - const databaseChildren = await getDatabaseChildrenOfDocument( + const pageChildren = await getPageChildrenOf(dataSourceConfig, pageOrDbId); + const databaseChildren = await getDatabaseChildrenOf( dataSourceConfig, - documentId + pageOrDbId ); // If they haven't yet been visited, add them to documents visited // and to the list of documents whose children should be fetched for (const child of [...pageChildren, ...databaseChildren]) { - if (!documents.some((d) => notionId(d) === notionId(child))) { - documents.push(child); + if ( + !pageOrDbs.some((d) => notionPageOrDbId(d) === notionPageOrDbId(child)) + ) { + pageOrDbs.push(child); } } } @@ -149,9 +146,9 @@ async function getPagesToUpdate( return pagesToUpdate; } -function notionId(document: NotionPage | NotionDatabase): string { +function notionPageOrDbId(pageOrDb: NotionPage | NotionDatabase): string { return ( - (document as NotionPage).notionPageId || - (document as NotionDatabase).notionDatabaseId + (pageOrDb as NotionPage).notionPageId || + (pageOrDb as NotionDatabase).notionDatabaseId ); } diff --git a/connectors/src/connectors/notion/temporal/activities.ts b/connectors/src/connectors/notion/temporal/activities.ts index 99f7b86bed5d..9c9ccf3a2008 100644 --- a/connectors/src/connectors/notion/temporal/activities.ts +++ b/connectors/src/connectors/notion/temporal/activities.ts @@ -209,7 +209,7 @@ export async function notionGetToSyncActivity( } export type UpsertActivityResult = { - document: NotionPage | NotionDatabase | null; + pageOrDb: NotionPage | NotionDatabase | null; createdOrMoved: boolean; }; @@ -232,7 +232,7 @@ export async function notionUpsertPageActivity( if (alreadySeenInRun) { localLogger.info("Skipping page already seen in this run"); - return { document: notionPage, createdOrMoved: false }; + return { pageOrDb: notionPage, createdOrMoved: false }; } const isSkipped = !!notionPage?.skipReason; @@ -242,7 +242,7 @@ export async function notionUpsertPageActivity( { skipReason: notionPage.skipReason }, "Skipping page with skip reason" ); - return { document: notionPage, createdOrMoved: false }; + return { pageOrDb: notionPage, 
createdOrMoved: false }; } let upsertTs: number | undefined = undefined; @@ -266,7 +266,7 @@ export async function notionUpsertPageActivity( lastUpsertedTs: upsertTs, skipReason: "body_too_large", }); - return { document: newNotionPage, createdOrMoved }; + return { pageOrDb: newNotionPage, createdOrMoved }; } if (parsedPage && parsedPage.hasBody) { @@ -311,7 +311,7 @@ export async function notionUpsertPageActivity( notionUrl: parsedPage ? parsedPage.url : null, lastUpsertedTs: upsertTs, }); - return { document: newNotionPage, createdOrMoved }; + return { pageOrDb: newNotionPage, createdOrMoved }; } export async function notionUpsertDatabaseActivity( @@ -333,7 +333,7 @@ export async function notionUpsertDatabaseActivity( if (alreadySeenInRun) { localLogger.info("Skipping database already seen in this run"); - return { document: notionDatabase, createdOrMoved: false }; + return { pageOrDb: notionDatabase, createdOrMoved: false }; } const isSkipped = !!notionDatabase?.skipReason; @@ -343,7 +343,7 @@ export async function notionUpsertDatabaseActivity( { skipReason: notionDatabase.skipReason }, "Skipping database with skip reason" ); - return { document: notionDatabase, createdOrMoved: false }; + return { pageOrDb: notionDatabase, createdOrMoved: false }; } localLogger.info( @@ -365,7 +365,7 @@ export async function notionUpsertDatabaseActivity( title: parsedDb ? parsedDb.title : null, notionUrl: parsedDb ? parsedDb.url : null, }); - return { document: newNotionDb, createdOrMoved: createdOrMoved }; + return { pageOrDb: newNotionDb, createdOrMoved: createdOrMoved }; } export async function saveSuccessSyncActivity( @@ -847,7 +847,7 @@ export async function updateParentsFieldsActivity( // createdOrMoved, then the document cannot be null thus the cast is safe const documents = activitiesResults .filter((aRes) => aRes.createdOrMoved) - .map((aRes) => aRes.document) as (NotionPage | NotionDatabase)[]; + .map((aRes) => aRes.pageOrDb) as (NotionPage | NotionDatabase)[]; await updateAllParentsFields( dataSourceConfig, documents, From 16f74e165ddf7e13d4620c7a5d5784fd60e683f3 Mon Sep 17 00:00:00 2001 From: philipperolet Date: Wed, 6 Sep 2023 15:29:45 +0200 Subject: [PATCH 24/32] migration script --- .../migrations/20230906_fill_parents_field.ts | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 connectors/migrations/20230906_fill_parents_field.ts diff --git a/connectors/migrations/20230906_fill_parents_field.ts b/connectors/migrations/20230906_fill_parents_field.ts new file mode 100644 index 000000000000..29c2111ae8f2 --- /dev/null +++ b/connectors/migrations/20230906_fill_parents_field.ts @@ -0,0 +1,52 @@ +import { updateAllParentsFields } from "@connectors/connectors/notion/lib/parents"; +import { Connector, NotionDatabase, NotionPage } from "@connectors/lib/models"; + +async function main() { + // if first arg is "all", update all connectors, else update only the + // connector for the corresponding workspace id + const connectors = + process.argv[2] === "all" + ? 
await Connector.findAll({ + where: { + type: "notion", + }, + }) + : await Connector.findAll({ + where: { + type: "notion", + workspaceId: process.argv[2], + }, + }); + + for (const connector of connectors) { + console.log(`Updating parents field for connector ${connector.id}`); + await updateParentsFieldForConnector(connector); + } +} + +async function updateParentsFieldForConnector(connector: Connector) { + // get all pages and databases for this connector + const pages = await NotionPage.findAll({ + where: { + connectorId: connector.id, + }, + }); + const databases = await NotionDatabase.findAll({ + where: { + connectorId: connector.id, + }, + }); + + // update all parents fields for all pages and databases + updateAllParentsFields(connector, [...pages, ...databases]); +} + +main() + .then(() => { + console.log("Done"); + process.exit(0); + }) + .catch((err) => { + console.error(err); + process.exit(1); + }); From 853d7ea876ce6834b3e2f3da97fd9ea692db0c4a Mon Sep 17 00:00:00 2001 From: philipperolet Date: Wed, 6 Sep 2023 16:29:54 +0200 Subject: [PATCH 25/32] cleaning --- connectors/migrations/20230906_fill_parents_field.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/connectors/migrations/20230906_fill_parents_field.ts b/connectors/migrations/20230906_fill_parents_field.ts index 29c2111ae8f2..3f9cae18bb8d 100644 --- a/connectors/migrations/20230906_fill_parents_field.ts +++ b/connectors/migrations/20230906_fill_parents_field.ts @@ -38,7 +38,7 @@ async function updateParentsFieldForConnector(connector: Connector) { }); // update all parents fields for all pages and databases - updateAllParentsFields(connector, [...pages, ...databases]); + await updateAllParentsFields(connector, [...pages, ...databases]); } main() From af02ecb6bd7d817a72a066fdfc3e1f8ea0806367 Mon Sep 17 00:00:00 2001 From: philipperolet Date: Wed, 6 Sep 2023 17:41:52 +0200 Subject: [PATCH 26/32] leaner getParents + fixes on potential inconsistencies --- .../src/connectors/notion/lib/parents.ts | 49 +++++++------------ .../connectors/notion/temporal/activities.ts | 13 ++--- .../connectors/notion/temporal/workflows.ts | 2 +- 3 files changed, 27 insertions(+), 37 deletions(-) diff --git a/connectors/src/connectors/notion/lib/parents.ts b/connectors/src/connectors/notion/lib/parents.ts index 0eaa7e8ace41..5b1190461a0c 100644 --- a/connectors/src/connectors/notion/lib/parents.ts +++ b/connectors/src/connectors/notion/lib/parents.ts @@ -2,6 +2,7 @@ import memoize from "lodash.memoize"; import { getDatabaseChildrenOf, + getNotionDatabaseFromConnectorsDb, getNotionPageFromConnectorsDb, getPageChildrenOf, } from "@connectors/connectors/notion/lib/connectors_db_helpers"; @@ -23,15 +24,19 @@ import { */ async function _getParents( dataSourceInfo: DataSourceInfo, - pageOrDb: { - notionId: string; - parentType: string | null | undefined; - parentId: string | null | undefined; - }, + pageOrDbId: string, // eslint-disable-next-line @typescript-eslint/no-unused-vars -- used for memoization memoizationKey?: string ): Promise { - const parents: string[] = [pageOrDb.notionId]; + const parents: string[] = [pageOrDbId]; + const pageOrDb = + (await getNotionPageFromConnectorsDb(dataSourceInfo, pageOrDbId)) || + (await getNotionDatabaseFromConnectorsDb(dataSourceInfo, pageOrDbId)); + if (!pageOrDb) { + // pageOrDb is either not synced yet (not an issue, see design doc) or + // is not in Dust's scope, in both cases we can just return the page id + return parents; + } switch (pageOrDb.parentType) { case "workspace": return 
parents; @@ -41,23 +46,10 @@ async function _getParents( return parents; case "page": case "database": { - // retrieve the parent from notion connectors db - // and add it to the parents array - const parent = await getNotionPageFromConnectorsDb( - dataSourceInfo, - pageOrDb.parentId as string // (cannot be null here) - ); - if (!parent) { - // The parent is either not synced yet (not an issue, see design doc) or - // is not in Dust's scope, in both cases we can just return the page id - return parents; - } return parents.concat( - await getParents(dataSourceInfo, { - notionId: parent.notionPageId, - parentType: parent.parentType, - parentId: parent.parentId, - }) + // parentId cannot be undefined here + // eslint-disable-next-line @typescript-eslint/no-non-null-assertion + await getParents(dataSourceInfo, pageOrDb.parentId!, memoizationKey) ); } default: @@ -67,8 +59,8 @@ async function _getParents( export const getParents = memoize( _getParents, - (dataSourceInfo, pageOrDb, memoizationKey) => { - return `${dataSourceInfo.dataSourceName}:${pageOrDb.notionId}:${memoizationKey}`; + (dataSourceInfo, pageOrDbId, memoizationKey) => { + return `${dataSourceInfo.dataSourceName}:${pageOrDbId}:${memoizationKey}`; } ); @@ -76,7 +68,7 @@ export async function updateAllParentsFields( dataSourceConfig: DataSourceConfig, pageOrDbs: (NotionPage | NotionDatabase)[], memoizationKey?: string -) { +): Promise { /* Computing all descendants, then updating, ensures the field is updated only once per page, limiting the load on the Datasource */ const pagesToUpdate = await getPagesToUpdate(pageOrDbs, dataSourceConfig); @@ -87,11 +79,7 @@ export async function updateAllParentsFields( for (const page of pagesToUpdate) { const parents = await getParents( dataSourceConfig, - { - notionId: page.notionPageId, - parentType: page.parentType, - parentId: page.parentId, - }, + page.notionPageId, memoizationKey ); @@ -101,6 +89,7 @@ export async function updateAllParentsFields( parents ); } + return pagesToUpdate.length; } /** Get ids of all pages whose parents field should be updated: initial pages in diff --git a/connectors/src/connectors/notion/temporal/activities.ts b/connectors/src/connectors/notion/temporal/activities.ts index 9c9ccf3a2008..6dffa2b01c09 100644 --- a/connectors/src/connectors/notion/temporal/activities.ts +++ b/connectors/src/connectors/notion/temporal/activities.ts @@ -274,11 +274,7 @@ export async function notionUpsertPageActivity( const documentId = `notion-${parsedPage.id}`; const parents = await getParents( dataSourceConfig, - { - notionId: pageId, - parentType: parsedPage.parentType, - parentId: parsedPage.parentId, - }, + pageId, runTimestamp.toString() // memoize at notionSyncWorkflow main inner loop level ); await upsertToDatasource({ @@ -843,14 +839,19 @@ export async function updateParentsFieldsActivity( activitiesResults: UpsertActivityResult[], activityExecutionTimestamp: number ) { + const localLogger = logger.child({ + workspaceId: dataSourceConfig.workspaceId, + dataSourceName: dataSourceConfig.dataSourceName, + }); // Get documents whose path changed (created or moved) If there is // createdOrMoved, then the document cannot be null thus the cast is safe const documents = activitiesResults .filter((aRes) => aRes.createdOrMoved) .map((aRes) => aRes.pageOrDb) as (NotionPage | NotionDatabase)[]; - await updateAllParentsFields( + const nbUpdated = await updateAllParentsFields( dataSourceConfig, documents, activityExecutionTimestamp.toString() ); + localLogger.info({ nbUpdated }, "Updated 
parents fields."); } diff --git a/connectors/src/connectors/notion/temporal/workflows.ts b/connectors/src/connectors/notion/temporal/workflows.ts index 42b932ccd363..8f4382b658a5 100644 --- a/connectors/src/connectors/notion/temporal/workflows.ts +++ b/connectors/src/connectors/notion/temporal/workflows.ts @@ -221,7 +221,7 @@ export async function notionSyncWorkflow( await updateParentsFieldsActivity( dataSourceConfig, syncWorkflowResults.flat().filter((r) => r) as UpsertActivityResult[], - runTimestamp + new Date().getTime() ); if (!isGargageCollectionRun) { From 9ae414a3f4be6666330f8216e6120be3cc89cf18 Mon Sep 17 00:00:00 2001 From: philipperolet Date: Wed, 6 Sep 2023 19:05:19 +0200 Subject: [PATCH 27/32] renaming --- ...ill_parents_field.ts => 20230906_notion_fill_parents_field.ts} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename connectors/migrations/{20230906_fill_parents_field.ts => 20230906_notion_fill_parents_field.ts} (100%) diff --git a/connectors/migrations/20230906_fill_parents_field.ts b/connectors/migrations/20230906_notion_fill_parents_field.ts similarity index 100% rename from connectors/migrations/20230906_fill_parents_field.ts rename to connectors/migrations/20230906_notion_fill_parents_field.ts From 4b73bb858f2ad1595002af505c9a7f45cf70161e Mon Sep 17 00:00:00 2001 From: philipperolet Date: Wed, 6 Sep 2023 19:12:14 +0200 Subject: [PATCH 28/32] Update parents during sync --- connectors/src/connectors/github/temporal/activities.ts | 4 ++-- core/src/data_sources/data_source.rs | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/connectors/src/connectors/github/temporal/activities.ts b/connectors/src/connectors/github/temporal/activities.ts index 6fc6295f3c2c..e6f1ad8a6f92 100644 --- a/connectors/src/connectors/github/temporal/activities.ts +++ b/connectors/src/connectors/github/temporal/activities.ts @@ -154,7 +154,7 @@ export async function githubUpsertIssueActivity( documentUrl: issue.url, timestampMs: lastUpdateTimestamp, tags: tags, - parents: [], + parents: [issueNumber.toString(), repoId.toString()], retries: 3, delayBetweenRetriesMs: 500, loggerArgs: { ...loggerArgs, provider: "github" }, @@ -281,7 +281,7 @@ export async function githubUpsertDiscussionActivity( documentUrl: discussion.url, timestampMs: new Date(discussion.createdAt).getTime(), tags, - parents: [], + parents: [discussionNumber.toString(), repoId.toString()], retries: 3, delayBetweenRetriesMs: 500, loggerArgs: { ...loggerArgs, provider: "github" }, diff --git a/core/src/data_sources/data_source.rs b/core/src/data_sources/data_source.rs index 06fbf68aa7de..c76b44e20c39 100644 --- a/core/src/data_sources/data_source.rs +++ b/core/src/data_sources/data_source.rs @@ -114,6 +114,9 @@ pub struct Chunk { /// parent in the array, their channel (since a channel does not have any /// parent). /// +/// For github, we store the github issue / discussion's number (as string), and +/// the repo name (its "parent" in the hierarchy). +/// /// Parents are at the time of writing only relevant for managed datasources /// since standard datasources do not allow specifying a hierarchy. 
A parent is /// represented by a string of characters which correspond to the parent's From 319f946fc6e08f82eb36ea7df0e0cee6838de2d5 Mon Sep 17 00:00:00 2001 From: philipperolet Date: Wed, 6 Sep 2023 19:34:05 +0200 Subject: [PATCH 29/32] migration script --- .../20230906_3_github_fill_parents_field.ts | 101 ++++++++++++++++++ .../connectors/github/temporal/activities.ts | 7 +- 2 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 connectors/migrations/20230906_3_github_fill_parents_field.ts diff --git a/connectors/migrations/20230906_3_github_fill_parents_field.ts b/connectors/migrations/20230906_3_github_fill_parents_field.ts new file mode 100644 index 000000000000..ebe2e7aa52d7 --- /dev/null +++ b/connectors/migrations/20230906_3_github_fill_parents_field.ts @@ -0,0 +1,101 @@ +import { + getDiscussionDocumentId, + getIssueDocumentId, +} from "@connectors/connectors/github/temporal/activities"; +import { updateDocumentParentsField } from "@connectors/lib/data_sources"; +import { + Connector, + GithubDiscussion, + GithubIssue, +} from "@connectors/lib/models"; + +async function main() { + // if first arg is "all", update all connectors, else update only the + // connector for the corresponding workspace id + const connectors = + process.argv[2] === "all" + ? await Connector.findAll({ + where: { + type: "github", + }, + }) + : await Connector.findAll({ + where: { + type: "github", + workspaceId: process.argv[2], + }, + }); + + for (const connector of connectors) { + console.log(`Updating parents field for connector ${connector.id}`); + await updateDiscussionsParentsFieldForConnector(connector); + await updateIssuesParentsFieldForConnector(connector); + } +} + +async function updateDiscussionsParentsFieldForConnector(connector: Connector) { + // get all distinct documentIds and their channel ids from slack messages in + // this connector + const documentData = await GithubDiscussion.findAll({ + where: { + connectorId: connector.id, + }, + attributes: ["repoId", "discussionNumber"], + }); + // update all parents fields for all pages and databases by chunks of 128 + const chunkSize = 128; + for (let i = 0; i < documentData.length; i += chunkSize) { + const chunk = documentData.slice(i, i + chunkSize); + console.log(`Updating ${chunk.length} documents`); + // update parents field for each document of the chunk, in parallel + await Promise.all( + chunk.map(async (document) => { + const docId = getDiscussionDocumentId( + document.repoId, + document.discussionNumber + ); + await updateDocumentParentsField(connector, docId, [ + document.discussionNumber.toString(), + document.repoId, + ]); + }) + ); + } +} + +async function updateIssuesParentsFieldForConnector(connector: Connector) { + // get all distinct documentIds and their channel ids from slack messages in + // this connector + const documentData = await GithubIssue.findAll({ + where: { + connectorId: connector.id, + }, + attributes: ["repoId", "issueNumber"], + }); + // update all parents fields for all pages and databases by chunks of 128 + const chunkSize = 128; + for (let i = 0; i < documentData.length; i += chunkSize) { + const chunk = documentData.slice(i, i + chunkSize); + console.log(`Updating ${chunk.length} documents`); + // update parents field for each document of the chunk, in parallel + await Promise.all( + chunk.map(async (document) => { + const docId = getIssueDocumentId(document.repoId, document.issueNumber); + await updateDocumentParentsField(connector, docId, [ + document.issueNumber.toString(), + 
document.repoId,
+        ]);
+      })
+    );
+  }
+}
+
+main()
+  .then(() => {
+    console.log("Done");
+    process.exit(0);
+  })
+  .catch((err) => {
+    console.error(err);
+    process.exit(1);
+  });
diff --git a/connectors/src/connectors/github/temporal/activities.ts b/connectors/src/connectors/github/temporal/activities.ts
index e6f1ad8a6f92..668d631f1dbc 100644
--- a/connectors/src/connectors/github/temporal/activities.ts
+++ b/connectors/src/connectors/github/temporal/activities.ts
@@ -585,11 +585,14 @@ function renderGithubUser(user: GithubUser | null): string {
   return `@${user.id}`;
 }
 
-function getIssueDocumentId(repoId: string, issueNumber: number): string {
+export function getIssueDocumentId(
+  repoId: string,
+  issueNumber: number
+): string {
   return `github-issue-${repoId}-${issueNumber}`;
 }
 
-function getDiscussionDocumentId(
+export function getDiscussionDocumentId(
   repoId: string,
   discussionNumber: number
 ): string {

From de327b6298cfee0d27e849eddc09957e84db9498 Mon Sep 17 00:00:00 2001
From: philipperolet
Date: Thu, 7 Sep 2023 18:58:56 +0200
Subject: [PATCH 30/32] comments from pr + adaptations from discussions

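The migration script now fails fast when called without an argument.
Hypothetical invocation (the exact runner may differ):

    npx ts-node connectors/migrations/20230906_3_github_fill_parents_field.ts all
    npx ts-node connectors/migrations/20230906_3_github_fill_parents_field.ts <workspaceId>
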
---
 .../20230906_3_github_fill_parents_field.ts   | 15 +++---
 .../connectors/github/temporal/activities.ts  | 18 ++++++--
 core/src/data_sources/data_source.rs          | 48 ++++++++++---------
 3 files changed, 50 insertions(+), 31 deletions(-)

diff --git a/connectors/migrations/20230906_3_github_fill_parents_field.ts b/connectors/migrations/20230906_3_github_fill_parents_field.ts
index ebe2e7aa52d7..8e8ec823036d 100644
--- a/connectors/migrations/20230906_3_github_fill_parents_field.ts
+++ b/connectors/migrations/20230906_3_github_fill_parents_field.ts
@@ -10,6 +10,10 @@ import {
 } from "@connectors/lib/models";
 
 async function main() {
+  if (!process.argv[2]) {
+    console.error("Missing workspace id or 'all' as first argument");
+    process.exit(1);
+  }
   // if first arg is "all", update all connectors, else update only the
   // connector for the corresponding workspace id
   const connectors =
@@ -42,7 +46,7 @@ async function updateDiscussionsParentsFieldForConnector(connector: Connector) {
     },
     attributes: ["repoId", "discussionNumber"],
   });
-  // update all parents fields for all pages and databases by chunks of 128
+  // update all parents fields for discussions by chunks of 128
   const chunkSize = 128;
   for (let i = 0; i < documentData.length; i += chunkSize) {
     const chunk = documentData.slice(i, i + chunkSize);
@@ -55,7 +59,7 @@ async function updateDiscussionsParentsFieldForConnector(connector: Connector) {
           document.discussionNumber
         );
         await updateDocumentParentsField(connector, docId, [
-          document.discussionNumber.toString(),
+          getDiscussionDocumentId(document.repoId, document.discussionNumber),
           document.repoId,
         ]);
       })
@@ -64,15 +68,14 @@ async function updateDiscussionsParentsFieldForConnector(connector: Connector) {
 }
 
 async function updateIssuesParentsFieldForConnector(connector: Connector) {
-  // get all distinct documentIds and their channel ids from slack messages in
-  // this connector
+  // get all distinct issues and their repo ids from this connector
   const documentData = await GithubIssue.findAll({
     where: {
       connectorId: connector.id,
     },
     attributes: ["repoId", "issueNumber"],
   });
-  // update all parents fields for all pages and databases by chunks of 128
+  // update all parents fields for all issues by chunks of 128
   const chunkSize = 128;
   for (let i = 0; i < documentData.length; i += chunkSize) {
     const chunk = documentData.slice(i, i + chunkSize);
@@ -82,7 +85,7 @@ async function updateIssuesParentsFieldForConnector(connector: Connector) {
       chunk.map(async (document) => {
         const docId = getIssueDocumentId(document.repoId, document.issueNumber);
         await updateDocumentParentsField(connector, docId, [
-          document.issueNumber.toString(),
+          getIssueDocumentId(document.repoId, document.issueNumber),
           document.repoId,
         ]);
       })
diff --git a/connectors/src/connectors/github/temporal/activities.ts b/connectors/src/connectors/github/temporal/activities.ts
index 668d631f1dbc..6d0b816a0231 100644
--- a/connectors/src/connectors/github/temporal/activities.ts
+++ b/connectors/src/connectors/github/temporal/activities.ts
@@ -154,7 +154,14 @@ export async function githubUpsertIssueActivity(
     documentUrl: issue.url,
     timestampMs: lastUpdateTimestamp,
     tags: tags,
-    parents: [issueNumber.toString(), repoId.toString()],
+    // The convention for parents is to use the external id string; it is ok for
+    // repos, but not practical for issues since the external id is the
+    // issue number, which is not guaranteed unique in the workspace.
+    // Therefore as a special case we use getIssueDocumentId() to get a parent string.
+    parents: [
+      getIssueDocumentId(repoId.toString(), issue.number),
+      repoId.toString(),
+    ],
     retries: 3,
     delayBetweenRetriesMs: 500,
     loggerArgs: { ...loggerArgs, provider: "github" },
@@ -281,7 +288,14 @@ export async function githubUpsertDiscussionActivity(
     documentUrl: discussion.url,
     timestampMs: new Date(discussion.createdAt).getTime(),
     tags,
-    parents: [discussionNumber.toString(), repoId.toString()],
+    // The convention for parents is to use the external id string; it is ok for
+    // repos, but not practical for discussions since the external id is the
+    // issue number, which is not guaranteed unique in the workspace.
+    // Therefore as a special case we use getDiscussionDocumentId() to get a parent string.
+    parents: [
+      getDiscussionDocumentId(repoId.toString(), discussionNumber),
+      repoId.toString(),
+    ],
     retries: 3,
     delayBetweenRetriesMs: 500,
     loggerArgs: { ...loggerArgs, provider: "github" },
diff --git a/core/src/data_sources/data_source.rs b/core/src/data_sources/data_source.rs
index c76b44e20c39..57d05d650165 100644
--- a/core/src/data_sources/data_source.rs
+++ b/core/src/data_sources/data_source.rs
@@ -95,33 +95,35 @@ pub struct Chunk {
 /// The "parents" field is an array of ids of parents to the document,
 /// corresponding to its hierarchy, ordered by closest parent first.
 ///
-/// At index 0 is the document id itself, then at index 1 its direct parent,
-/// then at index 2 is the direct parent of the element represented at index 1,
-/// etc. It is assumed that a document (or folder, or hierarchical level) only
-/// has at most one direct parent. Therefore, there is an unambiguous mapping
-/// between the parents array and the document's hierarchical position. For
-/// example, for a regular file system (or filesystem-like such as Google
-/// Drive), each parent would correspond to a subfolder in the path to the
-/// document.
+/// Parents are at the time of writing only relevant for managed datasources
+/// since standard datasources do not allow specifying a hierarchy. A parent is
+/// represented by a string of characters which correspond to the parent's
+/// external id, provided by the managed datasource's API (e.g. the Notion id
+/// for Notion pages and databases).
 ///
-/// The document's id is stored in the field, since the field is used in
-/// filtering search to search only parts of the hierarchy: it is natural that
-/// if the document's id is selected as a parent filter, the document itself
-/// shows up in the search.
+/// At index 0 is the string id of the document itself, then at index 1 its
+/// direct parent, then at index 2 is the direct parent of the element
+/// represented at index 1, etc. It is assumed that a document (or folder, or
+/// hierarchical level) only has at most one direct parent. Therefore, there is
+/// an unambiguous mapping between the parents array and the document's
+/// hierarchical position. For example, for a regular file system (or
+/// filesystem-like such as Google Drive), each parent would correspond to a
+/// subfolder in the path to the document.
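+///
+/// As an illustration (made-up ids), a file stored under "projects/2023/" in a
+/// Drive-like hierarchy could carry:
+/// parents = ["file-id-report", "folder-id-2023", "folder-id-projects"]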
 ///
-/// Note however that the hierarchical system depends on the managed datasource.
-/// For example, in the Slack managed datasource, documents only have a single
-/// parent in the array, their channel (since a channel does not have any
-/// parent).
+/// The id of the document itself is stored at index 0 because the field is used
+/// in filtering search to search only parts of the hierarchy: it is natural
+/// that if the document's id is selected as a parent filter, the document
+/// itself shows up in the search.
 ///
-/// For github, we store the github issue / discussion's number (as string), and
-/// the repo name (its "parent" in the hierarchy).
+/// Note however that the hierarchical system depends on the managed datasource.
+/// For example, in the Slack managed datasource, documents are aggregated
+/// messages from a channel. A channel does not have any parent, and there are
+/// no slack ids for our slack "documents", so the only value in the parents
+/// array is the slack channel id.
 ///
-/// Parents are at the time of writing only relevant for managed datasources
-/// since standard datasources do not allow specifying a hierarchy. A parent is
-/// represented by a string of characters which correspond to the parent's
-/// internal id (specific to the managed datasource)--not its name--provided by
-/// the managed datasource.
+/// For github, github issues / discussions do not have a proper external id, so
+/// we use our computed document id. The repo is considered a parent, and has a
+/// proper external "repo id", which is stored in second position in the array.
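+///
+/// For illustration (made-up ids), issue number 42 in a repo whose github id
+/// is 123456 would get: parents = ["github-issue-123456-42", "123456"]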
 #[derive(Debug, Serialize, Clone)]
 pub struct Document {

From 74a96b9d0eed019c9de3c0255a77b598c5d9aee0 Mon Sep 17 00:00:00 2001
From: philipperolet
Date: Fri, 8 Sep 2023 11:36:36 +0200
Subject: [PATCH 31/32] last PR comments

---
 .../migrations/20230906_3_github_fill_parents_field.ts | 4 ++--
 connectors/src/connectors/github/temporal/activities.ts | 8 ++++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/connectors/migrations/20230906_3_github_fill_parents_field.ts b/connectors/migrations/20230906_3_github_fill_parents_field.ts
index 8e8ec823036d..df42b9f210de 100644
--- a/connectors/migrations/20230906_3_github_fill_parents_field.ts
+++ b/connectors/migrations/20230906_3_github_fill_parents_field.ts
@@ -47,7 +47,7 @@ async function updateDiscussionsParentsFieldForConnector(connector: Connector) {
     attributes: ["repoId", "discussionNumber"],
   });
-  // update all parents fields for discussions by chunks of 128
-  const chunkSize = 128;
+  // update all parents fields for discussions by chunks of 32
+  const chunkSize = 32;
   for (let i = 0; i < documentData.length; i += chunkSize) {
     const chunk = documentData.slice(i, i + chunkSize);
     console.log(`Updating ${chunk.length} documents`);
@@ -76,7 +76,7 @@ async function updateIssuesParentsFieldForConnector(connector: Connector) {
   attributes: ["repoId", "issueNumber"],
   });
-  // update all parents fields for all issues by chunks of 128
-  const chunkSize = 128;
+  // update all parents fields for all issues by chunks of 32
+  const chunkSize = 32;
   for (let i = 0; i < documentData.length; i += chunkSize) {
     const chunk = documentData.slice(i, i + chunkSize);
     console.log(`Updating ${chunk.length} documents`);
diff --git a/connectors/src/connectors/github/temporal/activities.ts b/connectors/src/connectors/github/temporal/activities.ts
index 6d0b816a0231..c97c7723eb24 100644
--- a/connectors/src/connectors/github/temporal/activities.ts
+++ b/connectors/src/connectors/github/temporal/activities.ts
@@ -158,6 +158,8 @@ export async function githubUpsertIssueActivity(
     // repos, but not practical for issues since the external id is the
     // issue number, which is not guaranteed unique in the workspace.
     // Therefore as a special case we use getIssueDocumentId() to get a parent string.
+    // The repo id from github is globally unique so it is used as-is, per the
+    // convention of using the external id string.
     parents: [
       getIssueDocumentId(repoId.toString(), issue.number),
       repoId.toString(),
@@ -290,8 +292,10 @@ export async function githubUpsertDiscussionActivity(
     tags,
     // The convention for parents is to use the external id string; it is ok for
     // repos, but not practical for discussions since the external id is the
-    // issue number, which is not guaranteed unique in the workspace.
-    // Therefore as a special case we use getDiscussionDocumentId() to get a parent string.
+    // issue number, which is not guaranteed unique in the workspace. Therefore
+    // as a special case we use getDiscussionDocumentId() to get a parent string.
+    // The repo id from github is globally unique so it is used as-is, per the
+    // convention of using the external id string.
     parents: [
       getDiscussionDocumentId(repoId.toString(), discussionNumber),
       repoId.toString(),
     ],

From 8e8c090785ba00eb7ff65d884c86dfc8d6ba9fed Mon Sep 17 00:00:00 2001
From: philipperolet
Date: Fri, 8 Sep 2023 15:46:16 +0200
Subject: [PATCH 32/32] doc on convention on connector resource

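A sketch of the resulting convention for github internal ids (made-up
values; assumes getIssueDocumentId is imported from the github connector's
temporal activities):

    // The repo id is globally unique on github: the external id is used as-is.
    const repoInternalId = "123456";
    // Issue numbers are only unique per repo, so the internal id is computed,
    // matching the first entry of the corresponding document's parents field.
    const issueInternalId = getIssueDocumentId("123456", 42);
    // => "github-issue-123456-42"
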
---
 connectors/src/types/resources.ts | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/connectors/src/types/resources.ts b/connectors/src/types/resources.ts
index 47f7b7e47d9d..f4d8771f335d 100644
--- a/connectors/src/types/resources.ts
+++ b/connectors/src/types/resources.ts
@@ -15,8 +15,19 @@ export type ConnectorResourceType = "file" | "folder" | "database" | "channel";
  * - Slack: channels
  * - GoogleDrive: shared drive or sub-folders of shared drives.
  *
- * `internalId` and `parentInternalId` are internal opaque identifiers that should enable
- * reconstructing the tree structure of the resources.
+ * `internalId` and `parentInternalId` are internal opaque identifiers that
+ * should enable reconstructing the tree structure of the resources.
+ *
+ * Those ids must be aligned with those used in the "parents" field of data
+ * sources documents, to enable search filters on documents based on their
+ * parents; see the parents field documentation in core's data_source.rs.
+ *
+ * The convention to use for internal ids is to always use the externally
+ * provided id when possible (e.g. Notion page id, Github repository id,
+ * etc.). When not possible, such as for Github issues whose id is not
+ * workspace-unique, a custom function creates a unique id, which is
+ * used both in the parents field management code and the connectors resource
+ * code.
  */
 export type ConnectorResource = {
   provider: ConnectorProvider;