From caf49fb58b67906c5d261e37f9899eb62c20a150 Mon Sep 17 00:00:00 2001 From: trinity-1686a Date: Wed, 24 Jul 2024 16:09:23 +0200 Subject: [PATCH 1/4] support updating doc mapper through api --- .../src/index_config/serialize.rs | 12 +- .../src/default_doc_mapper/tokenizer_entry.rs | 12 +- .../file_backed/file_backed_index/mod.rs | 9 +- .../src/metastore/file_backed/mod.rs | 14 ++ .../src/metastore/index_metadata/mod.rs | 35 ++++- .../quickwit-metastore/src/metastore/mod.rs | 14 +- .../src/metastore/postgres/metastore.rs | 15 ++ .../quickwit-metastore/src/tests/index.rs | 135 +++++++++++++++++- quickwit/quickwit-metastore/src/tests/mod.rs | 14 ++ .../protos/quickwit/metastore.proto | 1 + .../codegen/quickwit/quickwit.metastore.rs | 2 + .../src/index_api/rest_handler.rs | 1 + 12 files changed, 244 insertions(+), 20 deletions(-) diff --git a/quickwit/quickwit-config/src/index_config/serialize.rs b/quickwit/quickwit-config/src/index_config/serialize.rs index cc05728b79c..65257d48d30 100644 --- a/quickwit/quickwit-config/src/index_config/serialize.rs +++ b/quickwit/quickwit-config/src/index_config/serialize.rs @@ -98,12 +98,6 @@ pub fn load_index_config_update( current_index_config.index_uri, new_index_config.index_uri ); - ensure!( - current_index_config - .doc_mapping - .eq_ignore_doc_mapping_uid(&new_index_config.doc_mapping), - "`doc_mapping` cannot be updated" - ); Ok(new_index_config) } @@ -440,12 +434,12 @@ mod test { tokenizer: default record: position "#; - let load_error = load_index_config_update( + let updated_config = load_index_config_update( ConfigFormat::Yaml, updated_config_yaml.as_bytes(), &original_config, ) - .unwrap_err(); - assert!(format!("{:?}", load_error).contains("`doc_mapping` cannot be updated")); + .unwrap(); + assert_eq!(updated_config.doc_mapping.field_mappings.len(), 1); } } diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs index 4a47fbab776..46d61ef1ee1 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs @@ -26,7 +26,7 @@ use tantivy::tokenizer::{ }; /// A `TokenizerEntry` defines a custom tokenizer with its name and configuration. -#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, utoipa::ToSchema)] +#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq, Hash, utoipa::ToSchema)] pub struct TokenizerEntry { /// Tokenizer name. pub name: String, @@ -36,7 +36,7 @@ pub struct TokenizerEntry { } /// Tokenizer configuration. -#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, utoipa::ToSchema)] +#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq, Hash, utoipa::ToSchema)] pub struct TokenizerConfig { #[serde(flatten)] pub(crate) tokenizer_type: TokenizerType, @@ -94,7 +94,7 @@ pub fn analyze_text(text: &str, tokenizer: &TokenizerConfig) -> anyhow::Result MetastoreResult { + self.metadata.set_doc_mapping(doc_mapping) + } + /// Stages a single split. /// /// If a split already exists and is in the [SplitState::Staged] state, diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs index 6157f20fed7..2c2847c4b00 100644 --- a/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs @@ -44,6 +44,7 @@ use futures::StreamExt; use itertools::Itertools; use quickwit_common::ServiceStream; use quickwit_config::IndexTemplate; +use quickwit_doc_mapper::DefaultDocMapperBuilder; use quickwit_proto::metastore::{ serde_utils, AcquireShardsRequest, AcquireShardsResponse, AddSourceRequest, CreateIndexRequest, CreateIndexResponse, CreateIndexTemplateRequest, DeleteIndexRequest, @@ -505,13 +506,26 @@ impl MetastoreService for FileBackedMetastore { let retention_policy_opt = request.deserialize_retention_policy()?; let search_settings = request.deserialize_search_settings()?; let indexing_settings = request.deserialize_indexing_settings()?; + let doc_mapping = request.deserialize_doc_mapping()?; let index_uid = request.index_uid(); + // verify the new mapping is coherent + let doc_mapper_builder = DefaultDocMapperBuilder { + doc_mapping: doc_mapping.clone(), + default_search_fields: search_settings.default_search_fields.clone(), + }; + doc_mapper_builder + .try_build() + .map_err(|e| MetastoreError::InvalidArgument { + message: format!("invalid mapping update: {e}"), + })?; + let index_metadata = self .mutate(index_uid, |index| { let mut mutation_occurred = index.set_retention_policy(retention_policy_opt); mutation_occurred |= index.set_search_settings(search_settings); mutation_occurred |= index.set_indexing_settings(indexing_settings); + mutation_occurred |= index.set_doc_mapping(doc_mapping)?; let index_metadata = index.metadata().clone(); diff --git a/quickwit/quickwit-metastore/src/metastore/index_metadata/mod.rs b/quickwit/quickwit-metastore/src/metastore/index_metadata/mod.rs index c7040f8cfb1..54f15a268c8 100644 --- a/quickwit/quickwit-metastore/src/metastore/index_metadata/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/index_metadata/mod.rs @@ -20,11 +20,11 @@ pub(crate) mod serialize; use std::collections::hash_map::Entry; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use quickwit_common::uri::Uri; use quickwit_config::{ - IndexConfig, IndexingSettings, RetentionPolicy, SearchSettings, SourceConfig, + DocMapping, IndexConfig, IndexingSettings, RetentionPolicy, SearchSettings, SourceConfig, }; use quickwit_proto::metastore::{EntityKind, MetastoreError, MetastoreResult}; use quickwit_proto::types::{IndexUid, SourceId}; @@ -130,6 +130,37 @@ impl IndexMetadata { } } + /// Replaces the doc mapping in the index config, returning whether a mutation occurred, or + /// possibly an error if this change is not allowed + pub fn set_doc_mapping(&mut self, doc_mapping: DocMapping) -> MetastoreResult { + if self.index_config.doc_mapping != doc_mapping { + if self.index_config.doc_mapping.doc_mapping_uid == doc_mapping.doc_mapping_uid { + let message = "tried to edit doc_mapping keeping the same uid".to_string(); + return Err(MetastoreError::InvalidArgument { message }); + } + if self.index_config.doc_mapping.timestamp_field != doc_mapping.timestamp_field { + let message = "tried to change timestamp field".to_string(); + return Err(MetastoreError::InvalidArgument { message }); + } + { + // TODO: i'm not sure this is necessary, we can relax this requirement once we know + // for sure + let old_tokenizers: HashSet<_> = + self.index_config.doc_mapping.tokenizers.iter().collect(); + let new_tokenizers: HashSet<_> = doc_mapping.tokenizers.iter().collect(); + if !new_tokenizers.is_superset(&old_tokenizers) { + let message = "removed or modified existing customtokenizers".to_string(); + return Err(MetastoreError::InvalidArgument { message }); + } + } + + self.index_config.doc_mapping = doc_mapping; + Ok(true) + } else { + Ok(false) + } + } + /// Adds a source to the index. Returns an error if the source already exists. pub fn add_source(&mut self, source_config: SourceConfig) -> MetastoreResult<()> { match self.sources.entry(source_config.source_id.clone()) { diff --git a/quickwit/quickwit-metastore/src/metastore/mod.rs b/quickwit/quickwit-metastore/src/metastore/mod.rs index 7378faa94fc..cbddbbd4c1f 100644 --- a/quickwit/quickwit-metastore/src/metastore/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/mod.rs @@ -33,7 +33,7 @@ pub use index_metadata::IndexMetadata; use itertools::Itertools; use quickwit_common::thread_pool::run_cpu_intensive; use quickwit_config::{ - IndexConfig, IndexingSettings, RetentionPolicy, SearchSettings, SourceConfig, + DocMapping, IndexConfig, IndexingSettings, RetentionPolicy, SearchSettings, SourceConfig, }; use quickwit_doc_mapper::tag_pruning::TagFilterAst; use quickwit_proto::metastore::{ @@ -191,6 +191,7 @@ pub trait UpdateIndexRequestExt { search_settings: &SearchSettings, retention_policy_opt: &Option, indexing_settings: &IndexingSettings, + doc_mapping: &DocMapping, ) -> MetastoreResult; /// Deserializes the `search_settings_json` field of an [`UpdateIndexRequest`] into a @@ -204,6 +205,10 @@ pub trait UpdateIndexRequestExt { /// Deserializes the `indexing_settings_json` field of an [`UpdateIndexRequest`] into a /// [`IndexingSettings`] object. fn deserialize_indexing_settings(&self) -> MetastoreResult; + + /// Deserilalize the `doc_mapping_json` field of an `[UpdateIndexRequest]` into a + /// [`DocMapping`] object. + fn deserialize_doc_mapping(&self) -> MetastoreResult; } impl UpdateIndexRequestExt for UpdateIndexRequest { @@ -212,6 +217,7 @@ impl UpdateIndexRequestExt for UpdateIndexRequest { search_settings: &SearchSettings, retention_policy_opt: &Option, indexing_settings: &IndexingSettings, + doc_mapping: &DocMapping, ) -> MetastoreResult { let search_settings_json = serde_utils::to_json_str(search_settings)?; let retention_policy_json = retention_policy_opt @@ -219,12 +225,14 @@ impl UpdateIndexRequestExt for UpdateIndexRequest { .map(serde_utils::to_json_str) .transpose()?; let indexing_settings_json = serde_utils::to_json_str(indexing_settings)?; + let doc_mapping_json = serde_utils::to_json_str(doc_mapping)?; let update_request = UpdateIndexRequest { index_uid: Some(index_uid.into()), search_settings_json, retention_policy_json, indexing_settings_json, + doc_mapping_json, }; Ok(update_request) } @@ -243,6 +251,10 @@ impl UpdateIndexRequestExt for UpdateIndexRequest { fn deserialize_indexing_settings(&self) -> MetastoreResult { serde_utils::from_json_str(&self.indexing_settings_json) } + + fn deserialize_doc_mapping(&self) -> MetastoreResult { + serde_utils::from_json_str(&self.doc_mapping_json) + } } /// Helper trait to build a [`IndexMetadataResponse`] and deserialize its payload. diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs index 6577938624f..c4bd12b8c33 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs @@ -30,6 +30,7 @@ use quickwit_config::{ validate_index_id_pattern, IndexTemplate, IndexTemplateId, PostgresMetastoreConfig, INGEST_V2_SOURCE_ID, }; +use quickwit_doc_mapper::DefaultDocMapperBuilder; use quickwit_proto::ingest::{Shard, ShardState}; use quickwit_proto::metastore::{ serde_utils, AcquireShardsRequest, AcquireShardsResponse, AddSourceRequest, CreateIndexRequest, @@ -399,6 +400,19 @@ impl MetastoreService for PostgresqlMetastore { let retention_policy_opt = request.deserialize_retention_policy()?; let search_settings = request.deserialize_search_settings()?; let indexing_settings = request.deserialize_indexing_settings()?; + let doc_mapping = request.deserialize_doc_mapping()?; + + // verify the new mapping is coherent + let doc_mapper_builder = DefaultDocMapperBuilder { + doc_mapping: doc_mapping.clone(), + default_search_fields: search_settings.default_search_fields.clone(), + }; + doc_mapper_builder + .try_build() + .map_err(|e| MetastoreError::InvalidArgument { + message: format!("invalid mapping update: {e}"), + })?; + let index_uid: IndexUid = request.index_uid().clone(); let updated_index_metadata = run_with_tx!(self.connection_pool, tx, { mutate_index_metadata::(tx, index_uid, |index_metadata| { @@ -406,6 +420,7 @@ impl MetastoreService for PostgresqlMetastore { index_metadata.set_retention_policy(retention_policy_opt); mutation_occurred |= index_metadata.set_search_settings(search_settings); mutation_occurred |= index_metadata.set_indexing_settings(indexing_settings); + mutation_occurred |= index_metadata.set_doc_mapping(doc_mapping)?; Ok(MutationOccurred::from(mutation_occurred)) }) .await diff --git a/quickwit/quickwit-metastore/src/tests/index.rs b/quickwit/quickwit-metastore/src/tests/index.rs index 613c7cd575d..1063224f653 100644 --- a/quickwit/quickwit-metastore/src/tests/index.rs +++ b/quickwit/quickwit-metastore/src/tests/index.rs @@ -31,13 +31,14 @@ use quickwit_config::{ IndexConfig, IndexingSettings, RetentionPolicy, SearchSettings, SourceConfig, CLI_SOURCE_ID, INGEST_V2_SOURCE_ID, }; +use quickwit_doc_mapper::{Cardinality, FieldMappingEntry, FieldMappingType, QuickwitJsonOptions}; use quickwit_proto::metastore::{ CreateIndexRequest, DeleteIndexRequest, EntityKind, IndexMetadataFailure, IndexMetadataFailureReason, IndexMetadataRequest, IndexMetadataSubrequest, IndexesMetadataRequest, ListIndexesMetadataRequest, MetastoreError, MetastoreService, StageSplitsRequest, UpdateIndexRequest, }; -use quickwit_proto::types::IndexUid; +use quickwit_proto::types::{DocMappingUid, IndexUid}; use super::DefaultForTest; use crate::tests::cleanup_index; @@ -126,6 +127,7 @@ pub async fn test_metastore_update_retention_policy< &index_config.search_settings, &loop_retention_policy_opt, &index_config.indexing_settings, + &index_config.doc_mapping, ) .unwrap(); let response_metadata = metastore @@ -172,6 +174,7 @@ pub async fn test_metastore_update_search_settings< }, &index_config.retention_policy_opt, &index_config.indexing_settings, + &index_config.doc_mapping, ) .unwrap(); let response_metadata = metastore @@ -228,6 +231,7 @@ pub async fn test_metastore_update_indexing_settings< merge_policy: loop_indexing_settings.clone(), ..Default::default() }, + &index_config.doc_mapping, ) .unwrap(); let resp_metadata = metastore @@ -256,6 +260,135 @@ pub async fn test_metastore_update_indexing_settings< cleanup_index(&mut metastore, index_uid).await; } +pub async fn test_metastore_update_doc_mapping< + MetastoreToTest: MetastoreService + MetastoreServiceExt + DefaultForTest, +>() { + let (mut metastore, index_uid, index_config) = + setup_metastore_for_update::().await; + + let json_options = QuickwitJsonOptions { + description: None, + stored: false, + indexing_options: None, + expand_dots: false, + fast: Default::default(), + }; + + let initial = index_config.doc_mapping.clone(); + let mut new_field = initial.clone(); + new_field.field_mappings.push(FieldMappingEntry { + name: "new_field".to_string(), + mapping_type: FieldMappingType::Json(json_options.clone(), Cardinality::SingleValued), + }); + new_field.doc_mapping_uid = DocMappingUid::random(); + let mut new_field_stored = initial.clone(); + new_field_stored.field_mappings.push(FieldMappingEntry { + name: "new_field".to_string(), + mapping_type: FieldMappingType::Json( + QuickwitJsonOptions { + stored: true, + ..json_options + }, + Cardinality::SingleValued, + ), + }); + new_field_stored.doc_mapping_uid = DocMappingUid::random(); + + for loop_doc_mapping in [initial.clone(), new_field, new_field_stored, initial] { + let index_update = UpdateIndexRequest::try_from_updates( + index_uid.clone(), + &index_config.search_settings, + &index_config.retention_policy_opt, + &index_config.indexing_settings, + &loop_doc_mapping, + ) + .unwrap(); + let resp_metadata = metastore + .update_index(index_update) + .await + .unwrap() + .deserialize_index_metadata() + .unwrap(); + assert_eq!(resp_metadata.index_config.doc_mapping, loop_doc_mapping); + let updated_metadata = metastore + .index_metadata(IndexMetadataRequest::for_index_id( + index_uid.index_id.to_string(), + )) + .await + .unwrap() + .deserialize_index_metadata() + .unwrap(); + assert_eq!(updated_metadata.index_config.doc_mapping, loop_doc_mapping); + } + cleanup_index(&mut metastore, index_uid).await; +} + +pub async fn test_metastore_update_doc_mapping_fail_case< + MetastoreToTest: MetastoreService + MetastoreServiceExt + DefaultForTest, +>() { + let (mut metastore, index_uid, index_config) = + setup_metastore_for_update::().await; + + let initial = index_config.doc_mapping.clone(); + let json_options = QuickwitJsonOptions { + description: None, + stored: false, + indexing_options: None, + expand_dots: false, + fast: Default::default(), + }; + + let mut new_field_same_id = initial.clone(); + new_field_same_id.field_mappings.push(FieldMappingEntry { + name: "new_field".to_string(), + mapping_type: FieldMappingType::Json( + QuickwitJsonOptions { + stored: true, + ..json_options + }, + Cardinality::SingleValued, + ), + }); + + let mut timestamp_field_changed = initial.clone(); + timestamp_field_changed.timestamp_field = None; + timestamp_field_changed.doc_mapping_uid = DocMappingUid::random(); + + let mut required_field_disappear = initial.clone(); + required_field_disappear + .field_mappings + .retain(|field| field.name != "timestamp"); + required_field_disappear.doc_mapping_uid = DocMappingUid::random(); + + for loop_doc_mapping in [ + new_field_same_id, + timestamp_field_changed, + required_field_disappear, + ] { + let index_update = UpdateIndexRequest::try_from_updates( + index_uid.clone(), + &index_config.search_settings, + &index_config.retention_policy_opt, + &index_config.indexing_settings, + &loop_doc_mapping, + ) + .unwrap(); + let resp = metastore.update_index(index_update).await; + assert!(resp.is_err()); + // verify nothing was persisted + let updated_metadata = metastore + .index_metadata(IndexMetadataRequest::for_index_id( + index_uid.index_id.to_string(), + )) + .await + .unwrap() + .deserialize_index_metadata() + .unwrap(); + assert_eq!(updated_metadata.index_config, index_config); + } + cleanup_index(&mut metastore, index_uid).await; +} + pub async fn test_metastore_create_index_with_sources< MetastoreToTest: MetastoreService + MetastoreServiceExt + DefaultForTest, >() { diff --git a/quickwit/quickwit-metastore/src/tests/mod.rs b/quickwit/quickwit-metastore/src/tests/mod.rs index b4878a1b978..dda47c0faa8 100644 --- a/quickwit/quickwit-metastore/src/tests/mod.rs +++ b/quickwit/quickwit-metastore/src/tests/mod.rs @@ -209,6 +209,20 @@ macro_rules! metastore_test_suite { $crate::tests::index::test_metastore_update_search_settings::<$metastore_type>().await; } + #[tokio::test] + #[serial_test::serial] + async fn test_metastore_update_doc_mapping() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::index::test_metastore_update_doc_mapping::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::serial] + async fn test_metastore_update_doc_mapping_fail_case() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::index::test_metastore_update_doc_mapping_fail_case::<$metastore_type>().await; + } + #[tokio::test] #[serial_test::serial] async fn test_metastore_update_indexing_settings() { diff --git a/quickwit/quickwit-proto/protos/quickwit/metastore.proto b/quickwit/quickwit-proto/protos/quickwit/metastore.proto index c542f586787..4daf5b6d171 100644 --- a/quickwit/quickwit-proto/protos/quickwit/metastore.proto +++ b/quickwit/quickwit-proto/protos/quickwit/metastore.proto @@ -216,6 +216,7 @@ message UpdateIndexRequest { string search_settings_json = 2; optional string retention_policy_json = 3; string indexing_settings_json = 4; + string doc_mapping_json = 5; } message ListIndexesMetadataRequest { diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.metastore.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.metastore.rs index 191152a91eb..4b3d88ee7ed 100644 --- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.metastore.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.metastore.rs @@ -32,6 +32,8 @@ pub struct UpdateIndexRequest { pub retention_policy_json: ::core::option::Option<::prost::alloc::string::String>, #[prost(string, tag = "4")] pub indexing_settings_json: ::prost::alloc::string::String, + #[prost(string, tag = "5")] + pub doc_mapping_json: ::prost::alloc::string::String, } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[allow(clippy::derive_partial_eq_without_eq)] diff --git a/quickwit/quickwit-serve/src/index_api/rest_handler.rs b/quickwit/quickwit-serve/src/index_api/rest_handler.rs index f51a83c1f0c..39de55d653c 100644 --- a/quickwit/quickwit-serve/src/index_api/rest_handler.rs +++ b/quickwit/quickwit-serve/src/index_api/rest_handler.rs @@ -588,6 +588,7 @@ async fn update_index( &new_index_config.search_settings, &new_index_config.retention_policy_opt, &new_index_config.indexing_settings, + &new_index_config.doc_mapping, )?; let update_resp = metastore.update_index(update_request).await?; Ok(update_resp.deserialize_index_metadata()?) From acfce2aa97fe9e5efca5d66c5cba97d9ee34eddf Mon Sep 17 00:00:00 2001 From: trinity-1686a Date: Fri, 26 Jul 2024 10:47:12 +0200 Subject: [PATCH 2/4] update doc on update api --- docs/configuration/index-config.md | 2 +- docs/reference/rest-api.md | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/configuration/index-config.md b/docs/configuration/index-config.md index 01eb922fa98..e5854ac1b39 100644 --- a/docs/configuration/index-config.md +++ b/docs/configuration/index-config.md @@ -14,7 +14,7 @@ In addition to the `index_id`, the index configuration lets you define five item - The **search settings**: it defines the default search fields `default_search_fields`, a list of fields that Quickwit will search into if the user query does not explicitly target a field. - The **retention policy**: it defines how long Quickwit should keep the indexed data. If not specified, the data is stored forever. -Configuration is generally set at index creation and cannot be modified, except for some specific attributes like the search settings and retention policy, which can be changed using the [update endpoint](../reference/rest-api.md) or the [CLI](../reference/cli.md). +Configuration is set at index creation but can be changed using the [update endpoint](../reference/rest-api.md) or the [CLI](../reference/cli.md). ## Config file format diff --git a/docs/reference/rest-api.md b/docs/reference/rest-api.md index 7835b1b1d8a..27ec0566220 100644 --- a/docs/reference/rest-api.md +++ b/docs/reference/rest-api.md @@ -319,6 +319,7 @@ Updates the configurations of an index. This endpoint follows PUT semantics, whi - The retention policy update is automatically picked up by the janitor service on its next state refresh. - The search settings update is automatically picked up by searcher nodes when the next query is executed. - The indexing settings update is not automatically picked up by the indexer nodes, they need to be manually restarted. +- The doc mapping update is not automatically picked up by the indexer nodes, they have to be manually restarted. #### PUT payload @@ -327,7 +328,7 @@ Updates the configurations of an index. This endpoint follows PUT semantics, whi | `version` | `String` | Config format version, use the same as your Quickwit version. | _required_ | | `index_id` | `String` | Index ID, must be the same index as in the request URI. | _required_ | | `index_uri` | `String` | Defines where the index files are stored. Cannot be updated. | `{current_index_uri}` | -| `doc_mapping` | `DocMapping` | Doc mapping object as specified in the [index config docs](../configuration/index-config.md#doc-mapping). Cannot be updated. | _required_ | +| `doc_mapping` | `DocMapping` | Doc mapping object as specified in the [index config docs](../configuration/index-config.md#doc-mapping). | _required_ | | `indexing_settings` | `IndexingSettings` | Indexing settings object as specified in the [index config docs](../configuration/index-config.md#indexing-settings). | | | `search_settings` | `SearchSettings` | Search settings object as specified in the [index config docs](../configuration/index-config.md#search-settings). | | | `retention` | `Retention` | Retention policy object as specified in the [index config docs](../configuration/index-config.md#retention-policy). | | From b632ca8ae18d3834dd8f3f900c59c0e3373a6ccd Mon Sep 17 00:00:00 2001 From: trinity-1686a Date: Fri, 26 Jul 2024 12:11:11 +0200 Subject: [PATCH 3/4] move doc mapping update check out of metastore --- docs/configuration/index-config.md | 2 +- docs/reference/rest-api.md | 4 + .../src/index_config/serialize.rs | 163 +++++++++++++++++- .../file_backed/file_backed_index/mod.rs | 2 +- .../src/metastore/file_backed/mod.rs | 14 +- .../src/metastore/index_metadata/mod.rs | 28 +-- .../src/metastore/postgres/metastore.rs | 14 +- .../quickwit-metastore/src/tests/index.rs | 66 ------- quickwit/quickwit-metastore/src/tests/mod.rs | 7 - 9 files changed, 174 insertions(+), 126 deletions(-) diff --git a/docs/configuration/index-config.md b/docs/configuration/index-config.md index e5854ac1b39..60b044d42ce 100644 --- a/docs/configuration/index-config.md +++ b/docs/configuration/index-config.md @@ -14,7 +14,7 @@ In addition to the `index_id`, the index configuration lets you define five item - The **search settings**: it defines the default search fields `default_search_fields`, a list of fields that Quickwit will search into if the user query does not explicitly target a field. - The **retention policy**: it defines how long Quickwit should keep the indexed data. If not specified, the data is stored forever. -Configuration is set at index creation but can be changed using the [update endpoint](../reference/rest-api.md) or the [CLI](../reference/cli.md). +Configuration is set at index creation and can be changed using the [update endpoint](../reference/rest-api.md) or the [CLI](../reference/cli.md). ## Config file format diff --git a/docs/reference/rest-api.md b/docs/reference/rest-api.md index 27ec0566220..a7d02954266 100644 --- a/docs/reference/rest-api.md +++ b/docs/reference/rest-api.md @@ -321,6 +321,10 @@ Updates the configurations of an index. This endpoint follows PUT semantics, whi - The indexing settings update is not automatically picked up by the indexer nodes, they need to be manually restarted. - The doc mapping update is not automatically picked up by the indexer nodes, they have to be manually restarted. +Updating the doc mapping doesn't reindex existing data. Queries and answers are mapped on a best effort basis when querying older splits. +It is also not possible to update the timestamp field, or to modify/remove existing non-default tokenizers (but it is possible to change +which tokenizer is used for a field). + #### PUT payload | Variable | Type | Description | Default value | diff --git a/quickwit/quickwit-config/src/index_config/serialize.rs b/quickwit/quickwit-config/src/index_config/serialize.rs index 65257d48d30..f4e9554ea54 100644 --- a/quickwit/quickwit-config/src/index_config/serialize.rs +++ b/quickwit/quickwit-config/src/index_config/serialize.rs @@ -17,8 +17,11 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . +use std::collections::HashSet; + use anyhow::{ensure, Context}; use quickwit_common::uri::Uri; +use quickwit_doc_mapper::DefaultDocMapperBuilder; use quickwit_proto::types::{DocMappingUid, IndexId}; use serde::{Deserialize, Serialize}; use tracing::info; @@ -81,7 +84,7 @@ pub fn load_index_config_update( .index_uri .parent() .expect("index URI should have a parent"); - let new_index_config = load_index_config_from_user_config( + let mut new_index_config = load_index_config_from_user_config( config_format, index_config_bytes, current_index_parent_dir, @@ -98,6 +101,63 @@ pub fn load_index_config_update( current_index_config.index_uri, new_index_config.index_uri ); + + // verify the new mapping is coherent + let doc_mapper_builder = DefaultDocMapperBuilder { + doc_mapping: new_index_config.doc_mapping.clone(), + default_search_fields: new_index_config + .search_settings + .default_search_fields + .clone(), + }; + doc_mapper_builder + .try_build() + .context("invalid mapping update")?; + + { + let new_mapping_uid = new_index_config.doc_mapping.doc_mapping_uid; + // we verify whether they are equal ignoring the mapping uid as it is generated at random: + // we don't want to record a mapping change when nothing really happened. + new_index_config.doc_mapping.doc_mapping_uid = + current_index_config.doc_mapping.doc_mapping_uid; + if new_index_config.doc_mapping != current_index_config.doc_mapping { + new_index_config.doc_mapping.doc_mapping_uid = new_mapping_uid; + ensure!( + current_index_config.doc_mapping.doc_mapping_uid + != new_index_config.doc_mapping.doc_mapping_uid, + "`doc_mapping_doc_mapping_uid` must change when the doc mapping is updated", + ); + ensure!( + current_index_config.doc_mapping.timestamp_field + == new_index_config.doc_mapping.timestamp_field, + "`doc_mapping.timestamp_field` cannot be updated, current value {}, new expected \ + value {}", + current_index_config + .doc_mapping + .timestamp_field + .as_deref() + .unwrap_or(""), + new_index_config + .doc_mapping + .timestamp_field + .as_deref() + .unwrap_or(""), + ); + // TODO: i'm not sure this is necessary, we can relax this requirement once we know + // for sure + let current_tokenizers: HashSet<_> = + current_index_config.doc_mapping.tokenizers.iter().collect(); + let new_tokenizers: HashSet<_> = + new_index_config.doc_mapping.tokenizers.iter().collect(); + ensure!( + new_tokenizers.is_superset(¤t_tokenizers), + "`.doc_mapping.tokenizers` must be a superset of previously available tokenizers" + ); + } else { + // the docmapping is unchanged, keep the old uid + } + } + Ok(new_index_config) } @@ -442,4 +502,105 @@ mod test { .unwrap(); assert_eq!(updated_config.doc_mapping.field_mappings.len(), 1); } + + #[test] + fn test_update_doc_mappings_failing_cases() { + let original_config_yaml = r#" + version: 0.8 + index_id: hdfs-logs + doc_mapping: + mode: lenient + doc_mapping_uid: 00000000000000000000000000 + timestamp_field: timestamp + field_mappings: + - name: timestamp + type: datetime + fast: true + "#; + let original_config: IndexConfig = load_index_config_from_user_config( + ConfigFormat::Yaml, + original_config_yaml.as_bytes(), + &Uri::for_test("s3://mybucket"), + ) + .unwrap(); + + let updated_config_yaml = r#" + version: 0.8 + index_id: hdfs-logs + doc_mapping: + mode: lenient + doc_mapping_uid: 00000000000000000000000000 + timestamp_field: timestamp + field_mappings: + - name: timestamp + type: datetime + fast: true + - name: body + type: text + tokenizer: default + record: position + "#; + load_index_config_update( + ConfigFormat::Yaml, + updated_config_yaml.as_bytes(), + &original_config, + ) + .expect_err("mapping changed but uid fixed should error"); + + let updated_config_yaml = r#" + version: 0.8 + index_id: hdfs-logs + doc_mapping: + mode: lenient + field_mappings: + - name: timestamp + type: datetime + fast: true + "#; + load_index_config_update( + ConfigFormat::Yaml, + updated_config_yaml.as_bytes(), + &original_config, + ) + .expect_err("timestamp field removed should error"); + + let updated_config_yaml = r#" + version: 0.8 + index_id: hdfs-logs + doc_mapping: + mode: lenient + timestamp_field: timestamp + field_mappings: + - name: body + type: text + tokenizer: default + record: position + "#; + load_index_config_update( + ConfigFormat::Yaml, + updated_config_yaml.as_bytes(), + &original_config, + ) + .expect_err("field required for timestamp is absent"); + + let updated_config_yaml = r#" + version: 0.8 + index_id: hdfs-logs + doc_mapping: + mode: lenient + timestamp_field: timestamp + field_mappings: + - name: timestamp + type: datetime + fast: true + search_settings: + default_search_fields: ["i_dont_exist"] + "#; + load_index_config_update( + ConfigFormat::Yaml, + updated_config_yaml.as_bytes(), + &original_config, + ) + .expect_err("field required for default search is absent"); + } } diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs index c151fb3c9c5..dd608e399bd 100644 --- a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs @@ -234,7 +234,7 @@ impl FileBackedIndex { /// Replaces the doc mapping in the index config, returning whether a mutation occurred, or /// possibly an error if this change is not allowed - pub fn set_doc_mapping(&mut self, doc_mapping: DocMapping) -> MetastoreResult { + pub fn set_doc_mapping(&mut self, doc_mapping: DocMapping) -> bool { self.metadata.set_doc_mapping(doc_mapping) } diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs index 2c2847c4b00..8a011f65e4c 100644 --- a/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs @@ -44,7 +44,6 @@ use futures::StreamExt; use itertools::Itertools; use quickwit_common::ServiceStream; use quickwit_config::IndexTemplate; -use quickwit_doc_mapper::DefaultDocMapperBuilder; use quickwit_proto::metastore::{ serde_utils, AcquireShardsRequest, AcquireShardsResponse, AddSourceRequest, CreateIndexRequest, CreateIndexResponse, CreateIndexTemplateRequest, DeleteIndexRequest, @@ -509,23 +508,12 @@ impl MetastoreService for FileBackedMetastore { let doc_mapping = request.deserialize_doc_mapping()?; let index_uid = request.index_uid(); - // verify the new mapping is coherent - let doc_mapper_builder = DefaultDocMapperBuilder { - doc_mapping: doc_mapping.clone(), - default_search_fields: search_settings.default_search_fields.clone(), - }; - doc_mapper_builder - .try_build() - .map_err(|e| MetastoreError::InvalidArgument { - message: format!("invalid mapping update: {e}"), - })?; - let index_metadata = self .mutate(index_uid, |index| { let mut mutation_occurred = index.set_retention_policy(retention_policy_opt); mutation_occurred |= index.set_search_settings(search_settings); mutation_occurred |= index.set_indexing_settings(indexing_settings); - mutation_occurred |= index.set_doc_mapping(doc_mapping)?; + mutation_occurred |= index.set_doc_mapping(doc_mapping); let index_metadata = index.metadata().clone(); diff --git a/quickwit/quickwit-metastore/src/metastore/index_metadata/mod.rs b/quickwit/quickwit-metastore/src/metastore/index_metadata/mod.rs index 54f15a268c8..e59693495f6 100644 --- a/quickwit/quickwit-metastore/src/metastore/index_metadata/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/index_metadata/mod.rs @@ -20,7 +20,7 @@ pub(crate) mod serialize; use std::collections::hash_map::Entry; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use quickwit_common::uri::Uri; use quickwit_config::{ @@ -132,32 +132,12 @@ impl IndexMetadata { /// Replaces the doc mapping in the index config, returning whether a mutation occurred, or /// possibly an error if this change is not allowed - pub fn set_doc_mapping(&mut self, doc_mapping: DocMapping) -> MetastoreResult { + pub fn set_doc_mapping(&mut self, doc_mapping: DocMapping) -> bool { if self.index_config.doc_mapping != doc_mapping { - if self.index_config.doc_mapping.doc_mapping_uid == doc_mapping.doc_mapping_uid { - let message = "tried to edit doc_mapping keeping the same uid".to_string(); - return Err(MetastoreError::InvalidArgument { message }); - } - if self.index_config.doc_mapping.timestamp_field != doc_mapping.timestamp_field { - let message = "tried to change timestamp field".to_string(); - return Err(MetastoreError::InvalidArgument { message }); - } - { - // TODO: i'm not sure this is necessary, we can relax this requirement once we know - // for sure - let old_tokenizers: HashSet<_> = - self.index_config.doc_mapping.tokenizers.iter().collect(); - let new_tokenizers: HashSet<_> = doc_mapping.tokenizers.iter().collect(); - if !new_tokenizers.is_superset(&old_tokenizers) { - let message = "removed or modified existing customtokenizers".to_string(); - return Err(MetastoreError::InvalidArgument { message }); - } - } - self.index_config.doc_mapping = doc_mapping; - Ok(true) + true } else { - Ok(false) + false } } diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs index c4bd12b8c33..0289b10867e 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs @@ -30,7 +30,6 @@ use quickwit_config::{ validate_index_id_pattern, IndexTemplate, IndexTemplateId, PostgresMetastoreConfig, INGEST_V2_SOURCE_ID, }; -use quickwit_doc_mapper::DefaultDocMapperBuilder; use quickwit_proto::ingest::{Shard, ShardState}; use quickwit_proto::metastore::{ serde_utils, AcquireShardsRequest, AcquireShardsResponse, AddSourceRequest, CreateIndexRequest, @@ -402,17 +401,6 @@ impl MetastoreService for PostgresqlMetastore { let indexing_settings = request.deserialize_indexing_settings()?; let doc_mapping = request.deserialize_doc_mapping()?; - // verify the new mapping is coherent - let doc_mapper_builder = DefaultDocMapperBuilder { - doc_mapping: doc_mapping.clone(), - default_search_fields: search_settings.default_search_fields.clone(), - }; - doc_mapper_builder - .try_build() - .map_err(|e| MetastoreError::InvalidArgument { - message: format!("invalid mapping update: {e}"), - })?; - let index_uid: IndexUid = request.index_uid().clone(); let updated_index_metadata = run_with_tx!(self.connection_pool, tx, { mutate_index_metadata::(tx, index_uid, |index_metadata| { @@ -420,7 +408,7 @@ impl MetastoreService for PostgresqlMetastore { index_metadata.set_retention_policy(retention_policy_opt); mutation_occurred |= index_metadata.set_search_settings(search_settings); mutation_occurred |= index_metadata.set_indexing_settings(indexing_settings); - mutation_occurred |= index_metadata.set_doc_mapping(doc_mapping)?; + mutation_occurred |= index_metadata.set_doc_mapping(doc_mapping); Ok(MutationOccurred::from(mutation_occurred)) }) .await diff --git a/quickwit/quickwit-metastore/src/tests/index.rs b/quickwit/quickwit-metastore/src/tests/index.rs index 1063224f653..c01947ba0ba 100644 --- a/quickwit/quickwit-metastore/src/tests/index.rs +++ b/quickwit/quickwit-metastore/src/tests/index.rs @@ -323,72 +323,6 @@ pub async fn test_metastore_update_doc_mapping< cleanup_index(&mut metastore, index_uid).await; } -pub async fn test_metastore_update_doc_mapping_fail_case< - MetastoreToTest: MetastoreService + MetastoreServiceExt + DefaultForTest, ->() { - let (mut metastore, index_uid, index_config) = - setup_metastore_for_update::().await; - - let initial = index_config.doc_mapping.clone(); - let json_options = QuickwitJsonOptions { - description: None, - stored: false, - indexing_options: None, - expand_dots: false, - fast: Default::default(), - }; - - let mut new_field_same_id = initial.clone(); - new_field_same_id.field_mappings.push(FieldMappingEntry { - name: "new_field".to_string(), - mapping_type: FieldMappingType::Json( - QuickwitJsonOptions { - stored: true, - ..json_options - }, - Cardinality::SingleValued, - ), - }); - - let mut timestamp_field_changed = initial.clone(); - timestamp_field_changed.timestamp_field = None; - timestamp_field_changed.doc_mapping_uid = DocMappingUid::random(); - - let mut required_field_disappear = initial.clone(); - required_field_disappear - .field_mappings - .retain(|field| field.name != "timestamp"); - required_field_disappear.doc_mapping_uid = DocMappingUid::random(); - - for loop_doc_mapping in [ - new_field_same_id, - timestamp_field_changed, - required_field_disappear, - ] { - let index_update = UpdateIndexRequest::try_from_updates( - index_uid.clone(), - &index_config.search_settings, - &index_config.retention_policy_opt, - &index_config.indexing_settings, - &loop_doc_mapping, - ) - .unwrap(); - let resp = metastore.update_index(index_update).await; - assert!(resp.is_err()); - // verify nothing was persisted - let updated_metadata = metastore - .index_metadata(IndexMetadataRequest::for_index_id( - index_uid.index_id.to_string(), - )) - .await - .unwrap() - .deserialize_index_metadata() - .unwrap(); - assert_eq!(updated_metadata.index_config, index_config); - } - cleanup_index(&mut metastore, index_uid).await; -} - pub async fn test_metastore_create_index_with_sources< MetastoreToTest: MetastoreService + MetastoreServiceExt + DefaultForTest, >() { diff --git a/quickwit/quickwit-metastore/src/tests/mod.rs b/quickwit/quickwit-metastore/src/tests/mod.rs index dda47c0faa8..a9079fdc982 100644 --- a/quickwit/quickwit-metastore/src/tests/mod.rs +++ b/quickwit/quickwit-metastore/src/tests/mod.rs @@ -216,13 +216,6 @@ macro_rules! metastore_test_suite { $crate::tests::index::test_metastore_update_doc_mapping::<$metastore_type>().await; } - #[tokio::test] - #[serial_test::serial] - async fn test_metastore_update_doc_mapping_fail_case() { - let _ = tracing_subscriber::fmt::try_init(); - $crate::tests::index::test_metastore_update_doc_mapping_fail_case::<$metastore_type>().await; - } - #[tokio::test] #[serial_test::serial] async fn test_metastore_update_indexing_settings() { From 3c0909a16ab53e6aa9896e7b66d74d9dfa416345 Mon Sep 17 00:00:00 2001 From: trinity-1686a Date: Fri, 26 Jul 2024 13:27:22 +0200 Subject: [PATCH 4/4] update comments --- .../src/metastore/file_backed/file_backed_index/mod.rs | 3 +-- .../quickwit-metastore/src/metastore/index_metadata/mod.rs | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs index dd608e399bd..120c0574831 100644 --- a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs @@ -232,8 +232,7 @@ impl FileBackedIndex { self.metadata.set_indexing_settings(search_settings) } - /// Replaces the doc mapping in the index config, returning whether a mutation occurred, or - /// possibly an error if this change is not allowed + /// Replaces the doc mapping in the index config, returning whether a mutation occurred. pub fn set_doc_mapping(&mut self, doc_mapping: DocMapping) -> bool { self.metadata.set_doc_mapping(doc_mapping) } diff --git a/quickwit/quickwit-metastore/src/metastore/index_metadata/mod.rs b/quickwit/quickwit-metastore/src/metastore/index_metadata/mod.rs index e59693495f6..0d551a26813 100644 --- a/quickwit/quickwit-metastore/src/metastore/index_metadata/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/index_metadata/mod.rs @@ -130,8 +130,7 @@ impl IndexMetadata { } } - /// Replaces the doc mapping in the index config, returning whether a mutation occurred, or - /// possibly an error if this change is not allowed + /// Replaces the current doc mapping, returning whether a mutation occurred. pub fn set_doc_mapping(&mut self, doc_mapping: DocMapping) -> bool { if self.index_config.doc_mapping != doc_mapping { self.index_config.doc_mapping = doc_mapping;