diff --git a/docs/configuration/index-config.md b/docs/configuration/index-config.md index 01eb922fa98..60b044d42ce 100644 --- a/docs/configuration/index-config.md +++ b/docs/configuration/index-config.md @@ -14,7 +14,7 @@ In addition to the `index_id`, the index configuration lets you define five item - The **search settings**: it defines the default search fields `default_search_fields`, a list of fields that Quickwit will search into if the user query does not explicitly target a field. - The **retention policy**: it defines how long Quickwit should keep the indexed data. If not specified, the data is stored forever. -Configuration is generally set at index creation and cannot be modified, except for some specific attributes like the search settings and retention policy, which can be changed using the [update endpoint](../reference/rest-api.md) or the [CLI](../reference/cli.md). +Configuration is set at index creation and can be changed using the [update endpoint](../reference/rest-api.md) or the [CLI](../reference/cli.md). ## Config file format diff --git a/docs/reference/rest-api.md b/docs/reference/rest-api.md index 7835b1b1d8a..a7d02954266 100644 --- a/docs/reference/rest-api.md +++ b/docs/reference/rest-api.md @@ -319,6 +319,11 @@ Updates the configurations of an index. This endpoint follows PUT semantics, whi - The retention policy update is automatically picked up by the janitor service on its next state refresh. - The search settings update is automatically picked up by searcher nodes when the next query is executed. - The indexing settings update is not automatically picked up by the indexer nodes, they need to be manually restarted. +- The doc mapping update is not automatically picked up by the indexer nodes, they have to be manually restarted. + +Updating the doc mapping doesn't reindex existing data. Queries and answers are mapped on a best effort basis when querying older splits. +It is also not possible to update the timestamp field, or to modify/remove existing non-default tokenizers (but it is possible to change +which tokenizer is used for a field). #### PUT payload @@ -327,7 +332,7 @@ Updates the configurations of an index. This endpoint follows PUT semantics, whi | `version` | `String` | Config format version, use the same as your Quickwit version. | _required_ | | `index_id` | `String` | Index ID, must be the same index as in the request URI. | _required_ | | `index_uri` | `String` | Defines where the index files are stored. Cannot be updated. | `{current_index_uri}` | -| `doc_mapping` | `DocMapping` | Doc mapping object as specified in the [index config docs](../configuration/index-config.md#doc-mapping). Cannot be updated. | _required_ | +| `doc_mapping` | `DocMapping` | Doc mapping object as specified in the [index config docs](../configuration/index-config.md#doc-mapping). | _required_ | | `indexing_settings` | `IndexingSettings` | Indexing settings object as specified in the [index config docs](../configuration/index-config.md#indexing-settings). | | | `search_settings` | `SearchSettings` | Search settings object as specified in the [index config docs](../configuration/index-config.md#search-settings). | | | `retention` | `Retention` | Retention policy object as specified in the [index config docs](../configuration/index-config.md#retention-policy). | | diff --git a/quickwit/quickwit-config/src/index_config/serialize.rs b/quickwit/quickwit-config/src/index_config/serialize.rs index cc05728b79c..f4e9554ea54 100644 --- a/quickwit/quickwit-config/src/index_config/serialize.rs +++ b/quickwit/quickwit-config/src/index_config/serialize.rs @@ -17,8 +17,11 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . +use std::collections::HashSet; + use anyhow::{ensure, Context}; use quickwit_common::uri::Uri; +use quickwit_doc_mapper::DefaultDocMapperBuilder; use quickwit_proto::types::{DocMappingUid, IndexId}; use serde::{Deserialize, Serialize}; use tracing::info; @@ -81,7 +84,7 @@ pub fn load_index_config_update( .index_uri .parent() .expect("index URI should have a parent"); - let new_index_config = load_index_config_from_user_config( + let mut new_index_config = load_index_config_from_user_config( config_format, index_config_bytes, current_index_parent_dir, @@ -98,12 +101,63 @@ pub fn load_index_config_update( current_index_config.index_uri, new_index_config.index_uri ); - ensure!( - current_index_config - .doc_mapping - .eq_ignore_doc_mapping_uid(&new_index_config.doc_mapping), - "`doc_mapping` cannot be updated" - ); + + // verify the new mapping is coherent + let doc_mapper_builder = DefaultDocMapperBuilder { + doc_mapping: new_index_config.doc_mapping.clone(), + default_search_fields: new_index_config + .search_settings + .default_search_fields + .clone(), + }; + doc_mapper_builder + .try_build() + .context("invalid mapping update")?; + + { + let new_mapping_uid = new_index_config.doc_mapping.doc_mapping_uid; + // we verify whether they are equal ignoring the mapping uid as it is generated at random: + // we don't want to record a mapping change when nothing really happened. + new_index_config.doc_mapping.doc_mapping_uid = + current_index_config.doc_mapping.doc_mapping_uid; + if new_index_config.doc_mapping != current_index_config.doc_mapping { + new_index_config.doc_mapping.doc_mapping_uid = new_mapping_uid; + ensure!( + current_index_config.doc_mapping.doc_mapping_uid + != new_index_config.doc_mapping.doc_mapping_uid, + "`doc_mapping_doc_mapping_uid` must change when the doc mapping is updated", + ); + ensure!( + current_index_config.doc_mapping.timestamp_field + == new_index_config.doc_mapping.timestamp_field, + "`doc_mapping.timestamp_field` cannot be updated, current value {}, new expected \ + value {}", + current_index_config + .doc_mapping + .timestamp_field + .as_deref() + .unwrap_or(""), + new_index_config + .doc_mapping + .timestamp_field + .as_deref() + .unwrap_or(""), + ); + // TODO: i'm not sure this is necessary, we can relax this requirement once we know + // for sure + let current_tokenizers: HashSet<_> = + current_index_config.doc_mapping.tokenizers.iter().collect(); + let new_tokenizers: HashSet<_> = + new_index_config.doc_mapping.tokenizers.iter().collect(); + ensure!( + new_tokenizers.is_superset(¤t_tokenizers), + "`.doc_mapping.tokenizers` must be a superset of previously available tokenizers" + ); + } else { + // the docmapping is unchanged, keep the old uid + } + } + Ok(new_index_config) } @@ -440,12 +494,113 @@ mod test { tokenizer: default record: position "#; - let load_error = load_index_config_update( + let updated_config = load_index_config_update( + ConfigFormat::Yaml, + updated_config_yaml.as_bytes(), + &original_config, + ) + .unwrap(); + assert_eq!(updated_config.doc_mapping.field_mappings.len(), 1); + } + + #[test] + fn test_update_doc_mappings_failing_cases() { + let original_config_yaml = r#" + version: 0.8 + index_id: hdfs-logs + doc_mapping: + mode: lenient + doc_mapping_uid: 00000000000000000000000000 + timestamp_field: timestamp + field_mappings: + - name: timestamp + type: datetime + fast: true + "#; + let original_config: IndexConfig = load_index_config_from_user_config( + ConfigFormat::Yaml, + original_config_yaml.as_bytes(), + &Uri::for_test("s3://mybucket"), + ) + .unwrap(); + + let updated_config_yaml = r#" + version: 0.8 + index_id: hdfs-logs + doc_mapping: + mode: lenient + doc_mapping_uid: 00000000000000000000000000 + timestamp_field: timestamp + field_mappings: + - name: timestamp + type: datetime + fast: true + - name: body + type: text + tokenizer: default + record: position + "#; + load_index_config_update( + ConfigFormat::Yaml, + updated_config_yaml.as_bytes(), + &original_config, + ) + .expect_err("mapping changed but uid fixed should error"); + + let updated_config_yaml = r#" + version: 0.8 + index_id: hdfs-logs + doc_mapping: + mode: lenient + field_mappings: + - name: timestamp + type: datetime + fast: true + "#; + load_index_config_update( + ConfigFormat::Yaml, + updated_config_yaml.as_bytes(), + &original_config, + ) + .expect_err("timestamp field removed should error"); + + let updated_config_yaml = r#" + version: 0.8 + index_id: hdfs-logs + doc_mapping: + mode: lenient + timestamp_field: timestamp + field_mappings: + - name: body + type: text + tokenizer: default + record: position + "#; + load_index_config_update( + ConfigFormat::Yaml, + updated_config_yaml.as_bytes(), + &original_config, + ) + .expect_err("field required for timestamp is absent"); + + let updated_config_yaml = r#" + version: 0.8 + index_id: hdfs-logs + doc_mapping: + mode: lenient + timestamp_field: timestamp + field_mappings: + - name: timestamp + type: datetime + fast: true + search_settings: + default_search_fields: ["i_dont_exist"] + "#; + load_index_config_update( ConfigFormat::Yaml, updated_config_yaml.as_bytes(), &original_config, ) - .unwrap_err(); - assert!(format!("{:?}", load_error).contains("`doc_mapping` cannot be updated")); + .expect_err("field required for default search is absent"); } } diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs index 4a47fbab776..46d61ef1ee1 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs @@ -26,7 +26,7 @@ use tantivy::tokenizer::{ }; /// A `TokenizerEntry` defines a custom tokenizer with its name and configuration. -#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, utoipa::ToSchema)] +#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq, Hash, utoipa::ToSchema)] pub struct TokenizerEntry { /// Tokenizer name. pub name: String, @@ -36,7 +36,7 @@ pub struct TokenizerEntry { } /// Tokenizer configuration. -#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, utoipa::ToSchema)] +#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq, Hash, utoipa::ToSchema)] pub struct TokenizerConfig { #[serde(flatten)] pub(crate) tokenizer_type: TokenizerType, @@ -94,7 +94,7 @@ pub fn analyze_text(text: &str, tokenizer: &TokenizerConfig) -> anyhow::Result bool { + self.metadata.set_doc_mapping(doc_mapping) + } + /// Stages a single split. /// /// If a split already exists and is in the [SplitState::Staged] state, diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs index 6157f20fed7..8a011f65e4c 100644 --- a/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs @@ -505,6 +505,7 @@ impl MetastoreService for FileBackedMetastore { let retention_policy_opt = request.deserialize_retention_policy()?; let search_settings = request.deserialize_search_settings()?; let indexing_settings = request.deserialize_indexing_settings()?; + let doc_mapping = request.deserialize_doc_mapping()?; let index_uid = request.index_uid(); let index_metadata = self @@ -512,6 +513,7 @@ impl MetastoreService for FileBackedMetastore { let mut mutation_occurred = index.set_retention_policy(retention_policy_opt); mutation_occurred |= index.set_search_settings(search_settings); mutation_occurred |= index.set_indexing_settings(indexing_settings); + mutation_occurred |= index.set_doc_mapping(doc_mapping); let index_metadata = index.metadata().clone(); diff --git a/quickwit/quickwit-metastore/src/metastore/index_metadata/mod.rs b/quickwit/quickwit-metastore/src/metastore/index_metadata/mod.rs index c7040f8cfb1..0d551a26813 100644 --- a/quickwit/quickwit-metastore/src/metastore/index_metadata/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/index_metadata/mod.rs @@ -24,7 +24,7 @@ use std::collections::HashMap; use quickwit_common::uri::Uri; use quickwit_config::{ - IndexConfig, IndexingSettings, RetentionPolicy, SearchSettings, SourceConfig, + DocMapping, IndexConfig, IndexingSettings, RetentionPolicy, SearchSettings, SourceConfig, }; use quickwit_proto::metastore::{EntityKind, MetastoreError, MetastoreResult}; use quickwit_proto::types::{IndexUid, SourceId}; @@ -130,6 +130,16 @@ impl IndexMetadata { } } + /// Replaces the current doc mapping, returning whether a mutation occurred. + pub fn set_doc_mapping(&mut self, doc_mapping: DocMapping) -> bool { + if self.index_config.doc_mapping != doc_mapping { + self.index_config.doc_mapping = doc_mapping; + true + } else { + false + } + } + /// Adds a source to the index. Returns an error if the source already exists. pub fn add_source(&mut self, source_config: SourceConfig) -> MetastoreResult<()> { match self.sources.entry(source_config.source_id.clone()) { diff --git a/quickwit/quickwit-metastore/src/metastore/mod.rs b/quickwit/quickwit-metastore/src/metastore/mod.rs index 7378faa94fc..cbddbbd4c1f 100644 --- a/quickwit/quickwit-metastore/src/metastore/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/mod.rs @@ -33,7 +33,7 @@ pub use index_metadata::IndexMetadata; use itertools::Itertools; use quickwit_common::thread_pool::run_cpu_intensive; use quickwit_config::{ - IndexConfig, IndexingSettings, RetentionPolicy, SearchSettings, SourceConfig, + DocMapping, IndexConfig, IndexingSettings, RetentionPolicy, SearchSettings, SourceConfig, }; use quickwit_doc_mapper::tag_pruning::TagFilterAst; use quickwit_proto::metastore::{ @@ -191,6 +191,7 @@ pub trait UpdateIndexRequestExt { search_settings: &SearchSettings, retention_policy_opt: &Option, indexing_settings: &IndexingSettings, + doc_mapping: &DocMapping, ) -> MetastoreResult; /// Deserializes the `search_settings_json` field of an [`UpdateIndexRequest`] into a @@ -204,6 +205,10 @@ pub trait UpdateIndexRequestExt { /// Deserializes the `indexing_settings_json` field of an [`UpdateIndexRequest`] into a /// [`IndexingSettings`] object. fn deserialize_indexing_settings(&self) -> MetastoreResult; + + /// Deserilalize the `doc_mapping_json` field of an `[UpdateIndexRequest]` into a + /// [`DocMapping`] object. + fn deserialize_doc_mapping(&self) -> MetastoreResult; } impl UpdateIndexRequestExt for UpdateIndexRequest { @@ -212,6 +217,7 @@ impl UpdateIndexRequestExt for UpdateIndexRequest { search_settings: &SearchSettings, retention_policy_opt: &Option, indexing_settings: &IndexingSettings, + doc_mapping: &DocMapping, ) -> MetastoreResult { let search_settings_json = serde_utils::to_json_str(search_settings)?; let retention_policy_json = retention_policy_opt @@ -219,12 +225,14 @@ impl UpdateIndexRequestExt for UpdateIndexRequest { .map(serde_utils::to_json_str) .transpose()?; let indexing_settings_json = serde_utils::to_json_str(indexing_settings)?; + let doc_mapping_json = serde_utils::to_json_str(doc_mapping)?; let update_request = UpdateIndexRequest { index_uid: Some(index_uid.into()), search_settings_json, retention_policy_json, indexing_settings_json, + doc_mapping_json, }; Ok(update_request) } @@ -243,6 +251,10 @@ impl UpdateIndexRequestExt for UpdateIndexRequest { fn deserialize_indexing_settings(&self) -> MetastoreResult { serde_utils::from_json_str(&self.indexing_settings_json) } + + fn deserialize_doc_mapping(&self) -> MetastoreResult { + serde_utils::from_json_str(&self.doc_mapping_json) + } } /// Helper trait to build a [`IndexMetadataResponse`] and deserialize its payload. diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs index 6577938624f..0289b10867e 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs @@ -399,6 +399,8 @@ impl MetastoreService for PostgresqlMetastore { let retention_policy_opt = request.deserialize_retention_policy()?; let search_settings = request.deserialize_search_settings()?; let indexing_settings = request.deserialize_indexing_settings()?; + let doc_mapping = request.deserialize_doc_mapping()?; + let index_uid: IndexUid = request.index_uid().clone(); let updated_index_metadata = run_with_tx!(self.connection_pool, tx, { mutate_index_metadata::(tx, index_uid, |index_metadata| { @@ -406,6 +408,7 @@ impl MetastoreService for PostgresqlMetastore { index_metadata.set_retention_policy(retention_policy_opt); mutation_occurred |= index_metadata.set_search_settings(search_settings); mutation_occurred |= index_metadata.set_indexing_settings(indexing_settings); + mutation_occurred |= index_metadata.set_doc_mapping(doc_mapping); Ok(MutationOccurred::from(mutation_occurred)) }) .await diff --git a/quickwit/quickwit-metastore/src/tests/index.rs b/quickwit/quickwit-metastore/src/tests/index.rs index 613c7cd575d..c01947ba0ba 100644 --- a/quickwit/quickwit-metastore/src/tests/index.rs +++ b/quickwit/quickwit-metastore/src/tests/index.rs @@ -31,13 +31,14 @@ use quickwit_config::{ IndexConfig, IndexingSettings, RetentionPolicy, SearchSettings, SourceConfig, CLI_SOURCE_ID, INGEST_V2_SOURCE_ID, }; +use quickwit_doc_mapper::{Cardinality, FieldMappingEntry, FieldMappingType, QuickwitJsonOptions}; use quickwit_proto::metastore::{ CreateIndexRequest, DeleteIndexRequest, EntityKind, IndexMetadataFailure, IndexMetadataFailureReason, IndexMetadataRequest, IndexMetadataSubrequest, IndexesMetadataRequest, ListIndexesMetadataRequest, MetastoreError, MetastoreService, StageSplitsRequest, UpdateIndexRequest, }; -use quickwit_proto::types::IndexUid; +use quickwit_proto::types::{DocMappingUid, IndexUid}; use super::DefaultForTest; use crate::tests::cleanup_index; @@ -126,6 +127,7 @@ pub async fn test_metastore_update_retention_policy< &index_config.search_settings, &loop_retention_policy_opt, &index_config.indexing_settings, + &index_config.doc_mapping, ) .unwrap(); let response_metadata = metastore @@ -172,6 +174,7 @@ pub async fn test_metastore_update_search_settings< }, &index_config.retention_policy_opt, &index_config.indexing_settings, + &index_config.doc_mapping, ) .unwrap(); let response_metadata = metastore @@ -228,6 +231,7 @@ pub async fn test_metastore_update_indexing_settings< merge_policy: loop_indexing_settings.clone(), ..Default::default() }, + &index_config.doc_mapping, ) .unwrap(); let resp_metadata = metastore @@ -256,6 +260,69 @@ pub async fn test_metastore_update_indexing_settings< cleanup_index(&mut metastore, index_uid).await; } +pub async fn test_metastore_update_doc_mapping< + MetastoreToTest: MetastoreService + MetastoreServiceExt + DefaultForTest, +>() { + let (mut metastore, index_uid, index_config) = + setup_metastore_for_update::().await; + + let json_options = QuickwitJsonOptions { + description: None, + stored: false, + indexing_options: None, + expand_dots: false, + fast: Default::default(), + }; + + let initial = index_config.doc_mapping.clone(); + let mut new_field = initial.clone(); + new_field.field_mappings.push(FieldMappingEntry { + name: "new_field".to_string(), + mapping_type: FieldMappingType::Json(json_options.clone(), Cardinality::SingleValued), + }); + new_field.doc_mapping_uid = DocMappingUid::random(); + let mut new_field_stored = initial.clone(); + new_field_stored.field_mappings.push(FieldMappingEntry { + name: "new_field".to_string(), + mapping_type: FieldMappingType::Json( + QuickwitJsonOptions { + stored: true, + ..json_options + }, + Cardinality::SingleValued, + ), + }); + new_field_stored.doc_mapping_uid = DocMappingUid::random(); + + for loop_doc_mapping in [initial.clone(), new_field, new_field_stored, initial] { + let index_update = UpdateIndexRequest::try_from_updates( + index_uid.clone(), + &index_config.search_settings, + &index_config.retention_policy_opt, + &index_config.indexing_settings, + &loop_doc_mapping, + ) + .unwrap(); + let resp_metadata = metastore + .update_index(index_update) + .await + .unwrap() + .deserialize_index_metadata() + .unwrap(); + assert_eq!(resp_metadata.index_config.doc_mapping, loop_doc_mapping); + let updated_metadata = metastore + .index_metadata(IndexMetadataRequest::for_index_id( + index_uid.index_id.to_string(), + )) + .await + .unwrap() + .deserialize_index_metadata() + .unwrap(); + assert_eq!(updated_metadata.index_config.doc_mapping, loop_doc_mapping); + } + cleanup_index(&mut metastore, index_uid).await; +} + pub async fn test_metastore_create_index_with_sources< MetastoreToTest: MetastoreService + MetastoreServiceExt + DefaultForTest, >() { diff --git a/quickwit/quickwit-metastore/src/tests/mod.rs b/quickwit/quickwit-metastore/src/tests/mod.rs index b4878a1b978..a9079fdc982 100644 --- a/quickwit/quickwit-metastore/src/tests/mod.rs +++ b/quickwit/quickwit-metastore/src/tests/mod.rs @@ -209,6 +209,13 @@ macro_rules! metastore_test_suite { $crate::tests::index::test_metastore_update_search_settings::<$metastore_type>().await; } + #[tokio::test] + #[serial_test::serial] + async fn test_metastore_update_doc_mapping() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::index::test_metastore_update_doc_mapping::<$metastore_type>().await; + } + #[tokio::test] #[serial_test::serial] async fn test_metastore_update_indexing_settings() { diff --git a/quickwit/quickwit-proto/protos/quickwit/metastore.proto b/quickwit/quickwit-proto/protos/quickwit/metastore.proto index c542f586787..4daf5b6d171 100644 --- a/quickwit/quickwit-proto/protos/quickwit/metastore.proto +++ b/quickwit/quickwit-proto/protos/quickwit/metastore.proto @@ -216,6 +216,7 @@ message UpdateIndexRequest { string search_settings_json = 2; optional string retention_policy_json = 3; string indexing_settings_json = 4; + string doc_mapping_json = 5; } message ListIndexesMetadataRequest { diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.metastore.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.metastore.rs index 191152a91eb..4b3d88ee7ed 100644 --- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.metastore.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.metastore.rs @@ -32,6 +32,8 @@ pub struct UpdateIndexRequest { pub retention_policy_json: ::core::option::Option<::prost::alloc::string::String>, #[prost(string, tag = "4")] pub indexing_settings_json: ::prost::alloc::string::String, + #[prost(string, tag = "5")] + pub doc_mapping_json: ::prost::alloc::string::String, } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[allow(clippy::derive_partial_eq_without_eq)] diff --git a/quickwit/quickwit-serve/src/index_api/rest_handler.rs b/quickwit/quickwit-serve/src/index_api/rest_handler.rs index f51a83c1f0c..39de55d653c 100644 --- a/quickwit/quickwit-serve/src/index_api/rest_handler.rs +++ b/quickwit/quickwit-serve/src/index_api/rest_handler.rs @@ -588,6 +588,7 @@ async fn update_index( &new_index_config.search_settings, &new_index_config.retention_policy_opt, &new_index_config.indexing_settings, + &new_index_config.doc_mapping, )?; let update_resp = metastore.update_index(update_request).await?; Ok(update_resp.deserialize_index_metadata()?)