From 2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5 Mon Sep 17 00:00:00 2001
From: PSeitz
Date: Fri, 18 Oct 2024 10:50:20 +0800
Subject: [PATCH] store DateTime as nanoseconds in doc store (#2486)

* store DateTime as nanoseconds in doc store

The doc store previously truncated DateTime values to microsecond
precision. This change removes the truncation while keeping backwards
compatibility. It does so by adding the trait
`ConfigurableBinarySerializable`, which works like `BinarySerializable`
but carries a config that currently allows de/serializing with
different date time precisions.

Bump the index format version to 7. Add a compat test that checks the
date time truncation.

* remove configurable binary serialize, add enum for doc store version

* test doc store version ord
---
 common/src/lib.rs                             |  2 +-
 common/src/serialize.rs                       | 20 ++--
 src/compat_tests.rs                           | 15 ++-
 src/lib.rs                                    |  2 +-
 src/schema/document/de.rs                     | 88 +++++++++++++++---
 src/schema/document/se.rs                     | 75 ++++++---------
 src/schema/facet.rs                           |  2 +-
 src/store/footer.rs                           | 19 +++-
 src/store/mod.rs                              |  5 +-
 src/store/reader.rs                           | 56 +++++++++--
 src/store/store_compressor.rs                 |  8 +-
 src/tokenizer/tokenized_string.rs             |  2 +-
 .../compat_tests_data/index_v7/.managed.json  |  1 +
 .../index_v7/.tantivy-meta.lock               |  0
 .../index_v7/.tantivy-writer.lock             |  0
 .../000002f0000000000000000000000000.fast     | Bin 0 -> 146 bytes
 ...000002f0000000000000000000000000.fieldnorm | Bin 0 -> 113 bytes
 .../000002f0000000000000000000000000.idx      | Bin 0 -> 130 bytes
 .../000002f0000000000000000000000000.pos      | Bin 0 -> 112 bytes
 .../000002f0000000000000000000000000.store    | Bin 0 -> 171 bytes
 .../000002f0000000000000000000000000.term     | Bin 0 -> 349 bytes
 tests/compat_tests_data/index_v7/meta.json    | 40 ++++++++
 22 files changed, 246 insertions(+), 89 deletions(-)
 create mode 100644 tests/compat_tests_data/index_v7/.managed.json
 create mode 100644 tests/compat_tests_data/index_v7/.tantivy-meta.lock
 create mode 100644 tests/compat_tests_data/index_v7/.tantivy-writer.lock
 create mode 100644 tests/compat_tests_data/index_v7/000002f0000000000000000000000000.fast
 create mode 100644 tests/compat_tests_data/index_v7/000002f0000000000000000000000000.fieldnorm
 create mode 100644 tests/compat_tests_data/index_v7/000002f0000000000000000000000000.idx
 create mode 100644 tests/compat_tests_data/index_v7/000002f0000000000000000000000000.pos
 create mode 100644 tests/compat_tests_data/index_v7/000002f0000000000000000000000000.store
 create mode 100644 tests/compat_tests_data/index_v7/000002f0000000000000000000000000.term
 create mode 100644 tests/compat_tests_data/index_v7/meta.json
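Note on the observable change (illustrative sketch, not part of the
diff): with format v6 a nanosecond timestamp came back from the doc
store truncated to microseconds; with v7 it round-trips unchanged. This
uses only the DateTime API that appears in this patch; the `common`
import path is assumed.

    use common::{DateTime, DateTimePrecision};

    fn main() {
        let dt = DateTime::from_timestamp_nanos(123_456);
        // v6 doc store: values came back truncated to microseconds.
        let truncated = dt.truncate(DateTimePrecision::Microseconds);
        assert_eq!(truncated, DateTime::from_timestamp_nanos(123_000));
        // v7 doc store: the nanosecond timestamp survives unchanged.
        assert_eq!(dt.into_timestamp_nanos(), 123_456);
    }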
diff --git a/common/src/lib.rs b/common/src/lib.rs
index 0a51f91fe3..cc4db87671 100644
--- a/common/src/lib.rs
+++ b/common/src/lib.rs
@@ -20,7 +20,7 @@ pub use datetime::{DateTime, DateTimePrecision};
 pub use group_by::GroupByIteratorExtended;
 pub use json_path_writer::JsonPathWriter;
 pub use ownedbytes::{OwnedBytes, StableDeref};
-pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
+pub use serialize::*;
 pub use vint::{
     read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt, VIntU128,
 };
diff --git a/common/src/serialize.rs b/common/src/serialize.rs
index 181d61e54c..c94521d8f4 100644
--- a/common/src/serialize.rs
+++ b/common/src/serialize.rs
@@ -74,14 +74,14 @@ impl FixedSize for () {
 
 impl<T: BinarySerializable> BinarySerializable for Vec<T> {
     fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
-        VInt(self.len() as u64).serialize(writer)?;
+        BinarySerializable::serialize(&VInt(self.len() as u64), writer)?;
         for it in self {
             it.serialize(writer)?;
         }
         Ok(())
     }
     fn deserialize<R: Read>(reader: &mut R) -> io::Result<Vec<T>> {
-        let num_items = VInt::deserialize(reader)?.val();
+        let num_items = <VInt as BinarySerializable>::deserialize(reader)?.val();
         let mut items: Vec<T> = Vec::with_capacity(num_items as usize);
         for _ in 0..num_items {
             let item = T::deserialize(reader)?;
@@ -236,12 +236,12 @@ impl FixedSize for bool {
 impl BinarySerializable for String {
     fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
         let data: &[u8] = self.as_bytes();
-        VInt(data.len() as u64).serialize(writer)?;
+        BinarySerializable::serialize(&VInt(data.len() as u64), writer)?;
         writer.write_all(data)
     }
 
     fn deserialize<R: Read>(reader: &mut R) -> io::Result<String> {
-        let string_length = VInt::deserialize(reader)?.val() as usize;
+        let string_length = <VInt as BinarySerializable>::deserialize(reader)?.val() as usize;
         let mut result = String::with_capacity(string_length);
         reader
             .take(string_length as u64)
@@ -253,12 +253,12 @@ impl BinarySerializable for String {
 impl<'a> BinarySerializable for Cow<'a, str> {
     fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
         let data: &[u8] = self.as_bytes();
-        VInt(data.len() as u64).serialize(writer)?;
+        BinarySerializable::serialize(&VInt(data.len() as u64), writer)?;
         writer.write_all(data)
     }
 
     fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, str>> {
-        let string_length = VInt::deserialize(reader)?.val() as usize;
+        let string_length = <VInt as BinarySerializable>::deserialize(reader)?.val() as usize;
         let mut result = String::with_capacity(string_length);
         reader
             .take(string_length as u64)
@@ -269,18 +269,18 @@ impl<'a> BinarySerializable for Cow<'a, str> {
 
 impl<'a> BinarySerializable for Cow<'a, [u8]> {
     fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
-        VInt(self.len() as u64).serialize(writer)?;
+        BinarySerializable::serialize(&VInt(self.len() as u64), writer)?;
         for it in self.iter() {
-            it.serialize(writer)?;
+            BinarySerializable::serialize(it, writer)?;
         }
         Ok(())
     }
 
     fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, [u8]>> {
-        let num_items = VInt::deserialize(reader)?.val();
+        let num_items = <VInt as BinarySerializable>::deserialize(reader)?.val();
         let mut items: Vec<u8> = Vec::with_capacity(num_items as usize);
         for _ in 0..num_items {
-            let item = u8::deserialize(reader)?;
+            let item = <u8 as BinarySerializable>::deserialize(reader)?;
             items.push(item);
         }
         Ok(Cow::Owned(items))
diff --git a/src/compat_tests.rs b/src/compat_tests.rs
index 6e75c5de2a..e77e94527b 100644
--- a/src/compat_tests.rs
+++ b/src/compat_tests.rs
@@ -44,8 +44,19 @@ fn test_format_6() {
     assert_date_time_precision(&index, DateTimePrecision::Microseconds);
 }
 
+/// Gated because the quickwit feature flag uses a different dictionary type.
+#[test]
+#[cfg(not(feature = "quickwit"))]
+fn test_format_7() {
+    let path = path_for_version("7");
+
+    let index = Index::open_in_dir(path).expect("Failed to open index");
+    // dates are not truncated in the v7 doc store
+    assert_date_time_precision(&index, DateTimePrecision::Nanoseconds);
+}
+
 #[cfg(not(feature = "quickwit"))]
-fn assert_date_time_precision(index: &Index, precision: DateTimePrecision) {
+fn assert_date_time_precision(index: &Index, doc_store_precision: DateTimePrecision) {
     use collector::TopDocs;
     let reader = index.reader().expect("Failed to create reader");
     let searcher = reader.searcher();
@@ -75,6 +86,6 @@ fn assert_date_time_precision(index: &Index, precision: DateTimePrecision) {
         .as_datetime()
         .unwrap();
 
-    let expected = DateTime::from_timestamp_nanos(123456).truncate(precision);
+    let expected = DateTime::from_timestamp_nanos(123456).truncate(doc_store_precision);
     assert_eq!(date_value, expected,);
 }
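For context on the framing these impls rely on: every variable-length
value is written as a VInt length prefix followed by the payload. A
self-contained sketch of the pattern (the varint scheme below is a
LEB128-style stand-in for illustration, not the crate's exact VInt
encoding):

    use std::io::{self, Read, Write};

    fn write_vint<W: Write>(mut n: u64, writer: &mut W) -> io::Result<()> {
        loop {
            let byte = (n & 0x7f) as u8;
            n >>= 7;
            if n == 0 {
                return writer.write_all(&[byte]);
            }
            writer.write_all(&[byte | 0x80])?; // continuation bit set
        }
    }

    fn read_vint<R: Read>(reader: &mut R) -> io::Result<u64> {
        let mut result = 0u64;
        let mut shift = 0;
        loop {
            let mut buf = [0u8; 1];
            reader.read_exact(&mut buf)?;
            result |= ((buf[0] & 0x7f) as u64) << shift;
            if buf[0] & 0x80 == 0 {
                return Ok(result);
            }
            shift += 7;
        }
    }

    // Length-prefixed payload, mirroring the Vec<T> / Cow<[u8]> impls above.
    fn serialize_bytes<W: Write>(data: &[u8], writer: &mut W) -> io::Result<()> {
        write_vint(data.len() as u64, writer)?;
        writer.write_all(data)
    }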
diff --git a/src/lib.rs b/src/lib.rs
index c34131e0a3..df6ba8d3ca 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -232,7 +232,7 @@ pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
 pub use crate::schema::{Document, TantivyDocument, Term};
 
 /// Index format version.
-pub const INDEX_FORMAT_VERSION: u32 = 6;
+pub const INDEX_FORMAT_VERSION: u32 = 7;
 /// Oldest index format version this tantivy version can read.
 pub const INDEX_FORMAT_OLDEST_SUPPORTED_VERSION: u32 = 4;
 
diff --git a/src/schema/document/de.rs b/src/schema/document/de.rs
index e80bff2c9e..01ab0afdc0 100644
--- a/src/schema/document/de.rs
+++ b/src/schema/document/de.rs
@@ -22,6 +22,7 @@ use super::se::BinaryObjectSerializer;
 use super::{OwnedValue, Value};
 use crate::schema::document::type_codes;
 use crate::schema::{Facet, Field};
+use crate::store::DocStoreVersion;
 use crate::tokenizer::PreTokenizedString;
 
 #[derive(Debug, thiserror::Error, Clone)]
@@ -45,6 +46,9 @@ pub enum DeserializeError {
     #[error("{0}")]
     /// A custom error message.
     Custom(String),
+    #[error("Version {0}, Max version supported: {1}")]
+    /// Unsupported version error.
+    UnsupportedVersion(u32, u32),
 }
 
 impl DeserializeError {
@@ -291,6 +295,7 @@ pub trait ObjectAccess<'de> {
 pub struct BinaryDocumentDeserializer<'de, R> {
     length: usize,
     position: usize,
+    doc_store_version: DocStoreVersion,
     reader: &'de mut R,
 }
 
@@ -298,12 +303,16 @@ impl<'de, R> BinaryDocumentDeserializer<'de, R>
 where R: Read
 {
     /// Attempts to create a new document deserializer from a given reader.
-    pub(crate) fn from_reader(reader: &'de mut R) -> Result<Self, DeserializeError> {
+    pub(crate) fn from_reader(
+        reader: &'de mut R,
+        doc_store_version: DocStoreVersion,
+    ) -> Result<Self, DeserializeError> {
         let length = VInt::deserialize(reader)?;
 
         Ok(Self {
             length: length.val() as usize,
             position: 0,
+            doc_store_version,
             reader,
         })
     }
@@ -329,8 +338,8 @@
         }
 
         let field = Field::deserialize(self.reader).map_err(DeserializeError::from)?;
-
-        let deserializer = BinaryValueDeserializer::from_reader(self.reader)?;
+        let deserializer =
+            BinaryValueDeserializer::from_reader(self.reader, self.doc_store_version)?;
         let value = V::deserialize(deserializer)?;
 
         self.position += 1;
@@ -344,13 +353,17 @@
 pub struct BinaryValueDeserializer<'de, R> {
     value_type: ValueType,
     reader: &'de mut R,
+    doc_store_version: DocStoreVersion,
 }
 
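The pattern in this file: the footer's DocStoreVersion is threaded
through every deserializer constructor rather than stored globally, so
nested arrays and objects can forward it when they build child
deserializers. A minimal sketch of the idea with illustrative types
(not the crate's API):

    use std::io::Read;

    #[derive(Clone, Copy)]
    enum DocStoreVersion {
        V1,
        V2,
    }

    struct ValueDeserializer<'a, R: Read> {
        reader: &'a mut R,
        version: DocStoreVersion,
    }

    impl<'a, R: Read> ValueDeserializer<'a, R> {
        fn from_reader(reader: &'a mut R, version: DocStoreVersion) -> Self {
            Self { reader, version }
        }

        // Nested values (array elements, object entries) reuse the reader
        // and inherit the same version.
        fn child(&mut self) -> ValueDeserializer<'_, R> {
            ValueDeserializer {
                reader: &mut *self.reader,
                version: self.version,
            }
        }
    }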
impl<'de, R> BinaryValueDeserializer<'de, R>
 where R: Read
 {
     /// Attempts to create a new value deserializer from a given reader.
-    fn from_reader(reader: &'de mut R) -> Result<Self, DeserializeError> {
+    fn from_reader(
+        reader: &'de mut R,
+        doc_store_version: DocStoreVersion,
+    ) -> Result<Self, DeserializeError> {
         let type_code = <u8 as BinarySerializable>::deserialize(reader)?;
 
         let value_type = match type_code {
@@ -391,7 +404,11 @@
             }
         };
 
-        Ok(Self { value_type, reader })
+        Ok(Self {
+            value_type,
+            reader,
+            doc_store_version,
+        })
     }
 
     fn validate_type(&self, expected_type: ValueType) -> Result<(), DeserializeError> {
@@ -438,7 +455,16 @@
 
     fn deserialize_datetime(self) -> Result<DateTime, DeserializeError> {
         self.validate_type(ValueType::DateTime)?;
-        <DateTime as BinarySerializable>::deserialize(self.reader).map_err(DeserializeError::from)
+        match self.doc_store_version {
+            DocStoreVersion::V1 => {
+                let timestamp_micros = <i64 as BinarySerializable>::deserialize(self.reader)?;
+                Ok(DateTime::from_timestamp_micros(timestamp_micros))
+            }
+            DocStoreVersion::V2 => {
+                let timestamp_nanos = <i64 as BinarySerializable>::deserialize(self.reader)?;
+                Ok(DateTime::from_timestamp_nanos(timestamp_nanos))
+            }
+        }
     }
 
     fn deserialize_facet(self) -> Result<Facet, DeserializeError> {
@@ -514,11 +540,13 @@
                 visitor.visit_pre_tokenized_string(val)
             }
             ValueType::Array => {
-                let access = BinaryArrayDeserializer::from_reader(self.reader)?;
+                let access =
+                    BinaryArrayDeserializer::from_reader(self.reader, self.doc_store_version)?;
                 visitor.visit_array(access)
             }
             ValueType::Object => {
-                let access = BinaryObjectDeserializer::from_reader(self.reader)?;
+                let access =
+                    BinaryObjectDeserializer::from_reader(self.reader, self.doc_store_version)?;
                 visitor.visit_object(access)
             }
             #[allow(deprecated)]
@@ -537,7 +565,8 @@
                 let out_rc = std::rc::Rc::new(out);
                 let mut slice: &[u8] = &out_rc;
 
-                let access = BinaryObjectDeserializer::from_reader(&mut slice)?;
+                let access =
+                    BinaryObjectDeserializer::from_reader(&mut slice, self.doc_store_version)?;
                 visitor.visit_object(access)
             }
 
@@ -551,19 +580,24 @@
 pub struct BinaryArrayDeserializer<'de, R> {
     length: usize,
     position: usize,
     reader: &'de mut R,
+    doc_store_version: DocStoreVersion,
 }
 
 impl<'de, R> BinaryArrayDeserializer<'de, R>
 where R: Read
 {
     /// Attempts to create a new array deserializer from a given reader.
-    fn from_reader(reader: &'de mut R) -> Result<Self, DeserializeError> {
+    fn from_reader(
+        reader: &'de mut R,
+        doc_store_version: DocStoreVersion,
+    ) -> Result<Self, DeserializeError> {
         let length = <VInt as BinarySerializable>::deserialize(reader)?;
 
         Ok(Self {
             length: length.val() as usize,
             position: 0,
             reader,
+            doc_store_version,
         })
     }
 
@@ -587,7 +621,8 @@
             return Ok(None);
         }
 
-        let deserializer = BinaryValueDeserializer::from_reader(self.reader)?;
+        let deserializer =
+            BinaryValueDeserializer::from_reader(self.reader, self.doc_store_version)?;
         let value = V::deserialize(deserializer)?;
 
         // Advance the position cursor.
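The match in deserialize_datetime above is the entire compatibility
story for dates: the same i64 slot is reinterpreted per version.
A standalone sketch of the arithmetic (illustrative types, not the
crate's API):

    #[derive(Clone, Copy)]
    enum Version {
        V1, // doc store wrote microseconds
        V2, // doc store writes nanoseconds
    }

    fn decode_datetime_nanos(raw: i64, version: Version) -> i64 {
        match version {
            Version::V1 => raw * 1_000, // widen legacy microseconds to nanos
            Version::V2 => raw,
        }
    }

    fn main() {
        // A v1 writer stored 123_456ns as 123us; decoding recovers 123_000ns.
        assert_eq!(decode_datetime_nanos(123, Version::V1), 123_000);
        // A v2 writer keeps full nanosecond precision.
        assert_eq!(decode_datetime_nanos(123_456, Version::V2), 123_456);
    }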
@@ -610,8 +645,11 @@ impl<'de, R> BinaryObjectDeserializer<'de, R>
 where R: Read
 {
     /// Attempts to create a new object deserializer from a given reader.
-    fn from_reader(reader: &'de mut R) -> Result<Self, DeserializeError> {
-        let inner = BinaryArrayDeserializer::from_reader(reader)?;
+    fn from_reader(
+        reader: &'de mut R,
+        doc_store_version: DocStoreVersion,
+    ) -> Result<Self, DeserializeError> {
+        let inner = BinaryArrayDeserializer::from_reader(reader, doc_store_version)?;
         Ok(Self { inner })
     }
 }
@@ -819,6 +857,7 @@ mod tests {
     use crate::schema::document::existing_type_impls::JsonObjectIter;
     use crate::schema::document::se::BinaryValueSerializer;
     use crate::schema::document::{ReferenceValue, ReferenceValueLeaf};
+    use crate::store::DOC_STORE_VERSION;
 
     fn serialize_value<'a>(value: ReferenceValue<'a, &'a serde_json::Value>) -> Vec<u8> {
         let mut writer = Vec::new();
@@ -829,9 +868,19 @@ mod tests {
         writer
     }
 
+    fn serialize_owned_value<'a>(value: ReferenceValue<'a, &'a OwnedValue>) -> Vec<u8> {
+        let mut writer = Vec::new();
+
+        let mut serializer = BinaryValueSerializer::new(&mut writer);
+        serializer.serialize_value(value).expect("Serialize value");
+
+        writer
+    }
+
     fn deserialize_value(buffer: Vec<u8>) -> crate::schema::OwnedValue {
         let mut cursor = Cursor::new(buffer);
-        let deserializer = BinaryValueDeserializer::from_reader(&mut cursor).unwrap();
+        let deserializer =
+            BinaryValueDeserializer::from_reader(&mut cursor, DOC_STORE_VERSION).unwrap();
         crate::schema::OwnedValue::deserialize(deserializer).expect("Deserialize value")
     }
 
@@ -1010,6 +1059,17 @@
         assert_eq!(value, expected_val);
     }
 
+    #[test]
+    fn test_nested_date_precision() {
+        let object = OwnedValue::Object(vec![(
+            "my-date".into(),
+            OwnedValue::Date(DateTime::from_timestamp_nanos(323456)),
+        )]);
+        let result = serialize_owned_value((&object).as_value());
+        let value = deserialize_value(result);
+        assert_eq!(value, object);
+    }
+
     #[test]
     fn test_nested_serialize() {
         let mut object = serde_json::Map::new();
diff --git a/src/schema/document/se.rs b/src/schema/document/se.rs
index edc8399b6e..9ad5003b70 100644
--- a/src/schema/document/se.rs
+++ b/src/schema/document/se.rs
@@ -81,6 +81,15 @@ where W: Write
         Self { writer }
     }
 
+    fn serialize_with_type_code<T: BinarySerializable>(
+        &mut self,
+        code: u8,
+        val: &T,
+    ) -> io::Result<()> {
+        self.write_type_code(code)?;
+        BinarySerializable::serialize(val, self.writer)
+    }
+
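serialize_with_type_code factors out the recurring "one tag byte, then
the payload" framing used by nearly every leaf type below. A sketch of
that framing with made-up tag values (not the crate's type_codes, and
the payload layout is illustrative):

    use std::io::{self, Write};

    const U64_TAG: u8 = 1; // illustrative tag values
    const TEXT_TAG: u8 = 2;

    fn write_tagged_u64<W: Write>(val: u64, writer: &mut W) -> io::Result<()> {
        writer.write_all(&[U64_TAG])?; // 1-byte type code
        writer.write_all(&val.to_le_bytes()) // fixed-width payload
    }

    fn write_tagged_str<W: Write>(val: &str, writer: &mut W) -> io::Result<()> {
        writer.write_all(&[TEXT_TAG])?; // 1-byte type code
        writer.write_all(&(val.len() as u64).to_le_bytes())?; // length prefix
        writer.write_all(val.as_bytes()) // variable payload
    }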
    /// Attempts to serialize a given value and write the output
     /// to the writer.
     pub(crate) fn serialize_value<'a, V>(
@@ -94,56 +103,38 @@ where W: Write
             ReferenceValue::Leaf(leaf) => match leaf {
                 ReferenceValueLeaf::Null => self.write_type_code(type_codes::NULL_CODE),
                 ReferenceValueLeaf::Str(val) => {
-                    self.write_type_code(type_codes::TEXT_CODE)?;
-
-                    let temp_val = Cow::Borrowed(val);
-                    temp_val.serialize(self.writer)
+                    self.serialize_with_type_code(type_codes::TEXT_CODE, &Cow::Borrowed(val))
                 }
                 ReferenceValueLeaf::U64(val) => {
-                    self.write_type_code(type_codes::U64_CODE)?;
-
-                    val.serialize(self.writer)
+                    self.serialize_with_type_code(type_codes::U64_CODE, &val)
                 }
                 ReferenceValueLeaf::I64(val) => {
-                    self.write_type_code(type_codes::I64_CODE)?;
-
-                    val.serialize(self.writer)
+                    self.serialize_with_type_code(type_codes::I64_CODE, &val)
                 }
                 ReferenceValueLeaf::F64(val) => {
-                    self.write_type_code(type_codes::F64_CODE)?;
-
-                    f64_to_u64(val).serialize(self.writer)
+                    self.serialize_with_type_code(type_codes::F64_CODE, &f64_to_u64(val))
                 }
                 ReferenceValueLeaf::Date(val) => {
                     self.write_type_code(type_codes::DATE_CODE)?;
-                    val.serialize(self.writer)
-                }
-                ReferenceValueLeaf::Facet(val) => {
-                    self.write_type_code(type_codes::HIERARCHICAL_FACET_CODE)?;
-
-                    Cow::Borrowed(val).serialize(self.writer)
+                    let timestamp_nanos: i64 = val.into_timestamp_nanos();
+                    BinarySerializable::serialize(&timestamp_nanos, self.writer)
                 }
+                ReferenceValueLeaf::Facet(val) => self.serialize_with_type_code(
+                    type_codes::HIERARCHICAL_FACET_CODE,
+                    &Cow::Borrowed(val),
+                ),
                 ReferenceValueLeaf::Bytes(val) => {
-                    self.write_type_code(type_codes::BYTES_CODE)?;
-
-                    let temp_val = Cow::Borrowed(val);
-                    temp_val.serialize(self.writer)
+                    self.serialize_with_type_code(type_codes::BYTES_CODE, &Cow::Borrowed(val))
                 }
                 ReferenceValueLeaf::IpAddr(val) => {
-                    self.write_type_code(type_codes::IP_CODE)?;
-
-                    val.to_u128().serialize(self.writer)
+                    self.serialize_with_type_code(type_codes::IP_CODE, &val.to_u128())
                 }
                 ReferenceValueLeaf::Bool(val) => {
-                    self.write_type_code(type_codes::BOOL_CODE)?;
-
-                    val.serialize(self.writer)
+                    self.serialize_with_type_code(type_codes::BOOL_CODE, &val)
                 }
                 ReferenceValueLeaf::PreTokStr(val) => {
                     self.write_type_code(type_codes::EXT_CODE)?;
-                    self.write_type_code(type_codes::TOK_STR_EXT_CODE)?;
-
-                    val.serialize(self.writer)
+                    self.serialize_with_type_code(type_codes::TOK_STR_EXT_CODE, &*val)
                 }
             },
             ReferenceValue::Array(elements) => {
@@ -306,7 +297,6 @@
 mod tests {
     use std::collections::BTreeMap;
 
-    use common::DateTime;
     use serde_json::Number;
     use tokenizer_api::Token;
 
@@ -337,7 +327,10 @@
                     $ext_code.serialize(&mut writer).unwrap();
                 )?
 
-                $value.serialize(&mut writer).unwrap();
+                BinarySerializable::serialize(
+                    &$value,
+                    &mut writer,
+                ).unwrap();
             )*
 
             writer
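The binary_repr! macros in these tests build the expected buffer by
hand (tag byte, then payload) so the assertions compare exact bytes.
The same technique without the macro, with a placeholder tag and layout
(not tantivy's actual type_codes or encoding):

    fn expected_tagged_u64(val: u64) -> Vec<u8> {
        let mut buf = Vec::new();
        buf.push(2u8); // type code byte first
        buf.extend_from_slice(&val.to_le_bytes()); // then the payload
        buf
    }

    fn main() {
        assert_eq!(expected_tagged_u64(1).len(), 9); // 1 tag + 8 payload bytes
    }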
@@ -355,7 +348,10 @@
                     $ext_code.serialize(&mut writer).unwrap();
                 )?
 
-                $value.serialize(&mut writer).unwrap();
+                BinarySerializable::serialize(
+                    &$value,
+                    &mut writer,
+                ).unwrap();
             )*
 
             writer
@@ -418,15 +414,6 @@
             "Expected serialized value to match the binary representation"
         );
 
-        let result = serialize_value(ReferenceValueLeaf::Date(DateTime::MAX).into());
-        let expected = binary_repr!(
-            type_codes::DATE_CODE => DateTime::MAX,
-        );
-        assert_eq!(
-            result, expected,
-            "Expected serialized value to match the binary representation"
-        );
-
         let facet = Facet::from_text("/hello/world").unwrap();
         let result = serialize_value(ReferenceValueLeaf::Facet(facet.encoded_str()).into());
         let expected = binary_repr!(
diff --git a/src/schema/facet.rs b/src/schema/facet.rs
index 275b9cb904..64b5981832 100644
--- a/src/schema/facet.rs
+++ b/src/schema/facet.rs
@@ -4,7 +4,7 @@ use std::io::{self, Read, Write};
 use std::str;
 use std::string::FromUtf8Error;
 
-use common::BinarySerializable;
+use common::*;
 use once_cell::sync::Lazy;
 use regex::Regex;
 use serde::de::Error as _;
diff --git a/src/store/footer.rs b/src/store/footer.rs
index 3505a55e0e..b4cc65a201 100644
--- a/src/store/footer.rs
+++ b/src/store/footer.rs
@@ -2,12 +2,13 @@ use std::io;
 
 use common::{BinarySerializable, FixedSize, HasLen};
 
-use super::{Decompressor, DOC_STORE_VERSION};
+use super::{Decompressor, DocStoreVersion, DOC_STORE_VERSION};
 use crate::directory::FileSlice;
 
 #[derive(Debug, Clone, PartialEq)]
 pub struct DocStoreFooter {
     pub offset: u64,
+    pub doc_store_version: DocStoreVersion,
     pub decompressor: Decompressor,
 }
 
@@ -25,9 +26,11 @@ impl BinarySerializable for DocStoreFooter {
     }
 
     fn deserialize<R: Read>(reader: &mut R) -> io::Result<DocStoreFooter> {
-        let doc_store_version = u32::deserialize(reader)?;
-        if doc_store_version != DOC_STORE_VERSION {
-            panic!("actual doc store version: {doc_store_version}, expected: {DOC_STORE_VERSION}");
+        let doc_store_version = DocStoreVersion::deserialize(reader)?;
+        if doc_store_version > DOC_STORE_VERSION {
+            panic!(
+                "actual doc store version: {doc_store_version}, max_supported: {DOC_STORE_VERSION}"
+            );
         }
         let offset = u64::deserialize(reader)?;
         let compressor_id = u8::deserialize(reader)?;
@@ -35,6 +38,7 @@
         reader.read_exact(&mut skip_buf)?;
         Ok(DocStoreFooter {
             offset,
+            doc_store_version,
             decompressor: Decompressor::from_id(compressor_id),
         })
     }
@@ -45,9 +49,14 @@ impl FixedSize for DocStoreFooter {
 }
 
 impl DocStoreFooter {
-    pub fn new(offset: u64, decompressor: Decompressor) -> Self {
+    pub fn new(
+        offset: u64,
+        decompressor: Decompressor,
+        doc_store_version: DocStoreVersion,
+    ) -> Self {
         DocStoreFooter {
             offset,
+            doc_store_version,
             decompressor,
         }
     }
diff --git a/src/store/mod.rs b/src/store/mod.rs
index 63327f0739..2a960ff1c0 100644
--- a/src/store/mod.rs
+++ b/src/store/mod.rs
@@ -35,15 +35,16 @@ mod footer;
 mod index;
 mod reader;
 mod writer;
+
 pub use self::compressors::{Compressor, ZstdCompressor};
 pub use self::decompressors::Decompressor;
-pub(crate) use self::reader::DOCSTORE_CACHE_CAPACITY;
 pub use self::reader::{CacheStats, StoreReader};
+pub(crate) use self::reader::{DocStoreVersion, DOCSTORE_CACHE_CAPACITY};
 pub use self::writer::StoreWriter;
 
 mod store_compressor;
 
 /// Doc store version in footer to handle format changes.
-pub(crate) const DOC_STORE_VERSION: u32 = 1;
+pub(crate) const DOC_STORE_VERSION: DocStoreVersion = DocStoreVersion::V2;
 
 #[cfg(feature = "lz4-compression")]
 mod compression_lz4_block;
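The footer is a fixed-size trailer, so readers locate it by seeking a
constant distance back from the end of the store file. A hedged sketch
of the layout implied by the (de)serializer above — version, offset,
compressor id, then reserved bytes; the reserved count is not shown in
this excerpt, so 4 below is a placeholder:

    use std::io::{self, Read, Write};

    const RESERVED: usize = 4; // placeholder; the real count is not shown here

    fn write_footer<W: Write>(
        version: u32,
        offset: u64,
        compressor_id: u8,
        w: &mut W,
    ) -> io::Result<()> {
        w.write_all(&version.to_le_bytes())?;
        w.write_all(&offset.to_le_bytes())?;
        w.write_all(&[compressor_id])?;
        w.write_all(&[0u8; RESERVED]) // reserved for future use
    }

    fn read_footer<R: Read>(r: &mut R) -> io::Result<(u32, u64, u8)> {
        let mut version = [0u8; 4];
        r.read_exact(&mut version)?;
        let mut offset = [0u8; 8];
        r.read_exact(&mut offset)?;
        let mut compressor_id = [0u8; 1];
        r.read_exact(&mut compressor_id)?;
        let mut reserved = [0u8; RESERVED];
        r.read_exact(&mut reserved)?; // skip reserved bytes
        Ok((
            u32::from_le_bytes(version),
            u64::from_le_bytes(offset),
            compressor_id[0],
        ))
    }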
diff --git a/src/store/reader.rs b/src/store/reader.rs
index dd88b776f7..21e101dab0 100644
--- a/src/store/reader.rs
+++ b/src/store/reader.rs
@@ -1,3 +1,4 @@
+use std::fmt::Display;
 use std::io;
 use std::iter::Sum;
 use std::num::NonZeroUsize;
@@ -25,9 +26,43 @@ pub(crate) const DOCSTORE_CACHE_CAPACITY: usize = 100;
 
 type Block = OwnedBytes;
 
+/// The format version of the document store.
+#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
+pub(crate) enum DocStoreVersion {
+    V1 = 1,
+    V2 = 2,
+}
+impl Display for DocStoreVersion {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            DocStoreVersion::V1 => write!(f, "V1"),
+            DocStoreVersion::V2 => write!(f, "V2"),
+        }
+    }
+}
+impl BinarySerializable for DocStoreVersion {
+    fn serialize<W: io::Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
+        (*self as u32).serialize(writer)
+    }
+
+    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<DocStoreVersion> {
+        Ok(match u32::deserialize(reader)? {
+            1 => DocStoreVersion::V1,
+            2 => DocStoreVersion::V2,
+            v => {
+                return Err(io::Error::new(
+                    io::ErrorKind::InvalidData,
+                    format!("Invalid doc store version {}", v),
+                ))
+            }
+        })
+    }
+}
+
 /// Reads document off tantivy's [`Store`](./index.html)
 pub struct StoreReader {
     decompressor: Decompressor,
+    doc_store_version: DocStoreVersion,
     data: FileSlice,
     skip_index: Arc<SkipIndex>,
     space_usage: StoreSpaceUsage,
@@ -129,6 +164,7 @@
         let skip_index = SkipIndex::open(index_data);
         Ok(StoreReader {
             decompressor: footer.decompressor,
+            doc_store_version: footer.doc_store_version,
             data: data_file,
             cache: BlockCache {
                 cache: NonZeroUsize::new(cache_num_blocks)
@@ -203,8 +239,9 @@
     pub fn get<D: DocumentDeserialize>(&self, doc_id: DocId) -> crate::Result<D> {
         let mut doc_bytes = self.get_document_bytes(doc_id)?;
 
-        let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
-            .map_err(crate::TantivyError::from)?;
+        let deserializer =
+            BinaryDocumentDeserializer::from_reader(&mut doc_bytes, self.doc_store_version)
+                .map_err(crate::TantivyError::from)?;
         D::deserialize(deserializer).map_err(crate::TantivyError::from)
     }
 
@@ -244,8 +281,9 @@
         self.iter_raw(alive_bitset).map(|doc_bytes_res| {
             let mut doc_bytes = doc_bytes_res?;
 
-            let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
-                .map_err(crate::TantivyError::from)?;
+            let deserializer =
+                BinaryDocumentDeserializer::from_reader(&mut doc_bytes, self.doc_store_version)
+                    .map_err(crate::TantivyError::from)?;
             D::deserialize(deserializer).map_err(crate::TantivyError::from)
         })
     }
@@ -391,8 +429,9 @@
     ) -> crate::Result<D> {
         let mut doc_bytes = self.get_document_bytes_async(doc_id, executor).await?;
 
-        let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
-            .map_err(crate::TantivyError::from)?;
+        let deserializer =
+            BinaryDocumentDeserializer::from_reader(&mut doc_bytes, self.doc_store_version)
+                .map_err(crate::TantivyError::from)?;
         D::deserialize(deserializer).map_err(crate::TantivyError::from)
     }
 }
@@ -414,6 +453,11 @@ mod tests {
         doc.get_first(*field).and_then(|f| f.as_value().as_str())
     }
 
+    #[test]
+    fn test_doc_store_version_ord() {
+        assert!(DocStoreVersion::V1 < DocStoreVersion::V2);
+    }
+
     #[test]
     fn test_store_lru_cache() -> crate::Result<()> {
         let directory = RamDirectory::create();
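Deriving PartialOrd on DocStoreVersion orders the variants by their
discriminants, which is what lets the footer check
`doc_store_version > DOC_STORE_VERSION` reject files written by a newer
tantivy while still accepting older ones. A sketch of the two
behaviours with standalone types:

    use std::io;

    #[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
    enum DocStoreVersion {
        V1 = 1,
        V2 = 2,
    }

    const MAX_SUPPORTED: DocStoreVersion = DocStoreVersion::V2;

    fn check(version: DocStoreVersion) -> io::Result<()> {
        if version > MAX_SUPPORTED {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "doc store written by a newer tantivy",
            ));
        }
        Ok(())
    }

    fn main() {
        assert!(DocStoreVersion::V1 < DocStoreVersion::V2); // discriminant order
        assert!(check(DocStoreVersion::V1).is_ok()); // old files stay readable
    }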
diff --git a/src/store/store_compressor.rs b/src/store/store_compressor.rs
index c528790480..ca9f107e6c 100644
--- a/src/store/store_compressor.rs
+++ b/src/store/store_compressor.rs
@@ -5,6 +5,7 @@ use std::{io, thread};
 
 use common::{BinarySerializable, CountingWriter, TerminatingWrite};
 
+use super::DOC_STORE_VERSION;
 use crate::directory::WritePtr;
 use crate::store::footer::DocStoreFooter;
 use crate::store::index::{Checkpoint, SkipIndexBuilder};
@@ -143,8 +144,11 @@ impl BlockCompressorImpl {
     fn close(mut self) -> io::Result<()> {
         let header_offset: u64 = self.writer.written_bytes();
-        let docstore_footer =
-            DocStoreFooter::new(header_offset, Decompressor::from(self.compressor));
+        let docstore_footer = DocStoreFooter::new(
+            header_offset,
+            Decompressor::from(self.compressor),
+            DOC_STORE_VERSION,
+        );
         self.offset_index_writer.serialize_into(&mut self.writer)?;
         docstore_footer.serialize(&mut self.writer)?;
         self.writer.terminate()
diff --git a/src/tokenizer/tokenized_string.rs b/src/tokenizer/tokenized_string.rs
index 8fbf51f8c5..e5d67cd9bf 100644
--- a/src/tokenizer/tokenized_string.rs
+++ b/src/tokenizer/tokenized_string.rs
@@ -2,7 +2,7 @@ use std::cmp::Ordering;
 use std::io;
 use std::io::{Read, Write};
 
-use common::BinarySerializable;
+use common::*;
 
 use crate::tokenizer::{Token, TokenStream};
 
diff --git a/tests/compat_tests_data/index_v7/.managed.json b/tests/compat_tests_data/index_v7/.managed.json
new file mode 100644
index 0000000000..ab0d201e66
--- /dev/null
+++ b/tests/compat_tests_data/index_v7/.managed.json
@@ -0,0 +1 @@
+["meta.json","000002f0000000000000000000000000.fieldnorm","000002f0000000000000000000000000.pos","000002f0000000000000000000000000.store","000002f0000000000000000000000000.term","000002f0000000000000000000000000.fast","000002f0000000000000000000000000.idx"]
diff --git a/tests/compat_tests_data/index_v7/.tantivy-meta.lock b/tests/compat_tests_data/index_v7/.tantivy-meta.lock
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/compat_tests_data/index_v7/.tantivy-writer.lock b/tests/compat_tests_data/index_v7/.tantivy-writer.lock
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/compat_tests_data/index_v7/000002f0000000000000000000000000.fast b/tests/compat_tests_data/index_v7/000002f0000000000000000000000000.fast
new file mode 100644
index 0000000000000000000000000000000000000000..8bd1d4318d319bda534570bd4218b3682785290b
GIT binary patch
literal 146
zcmZQzKmZm56U<@;k_u2UMj*`u!iD8d)k!r0WXHVkOGB`X61
E06twEZvX%Q

literal 0
HcmV?d00001

diff --git a/tests/compat_tests_data/index_v7/000002f0000000000000000000000000.fieldnorm b/tests/compat_tests_data/index_v7/000002f0000000000000000000000000.fieldnorm
new file mode 100644
index 0000000000000000000000000000000000000000..49005c2eb99345e656ccca0515269bcfc3b25abf
GIT binary patch
literal 113
zcmZQ%Y-(Ttf`&#$5XlQ9s+G!8i;6Sz^OUTrm2wla@{5$L40M!oGxIAzLWJ^{C1^~;^9>D+r

literal 0
HcmV?d00001

diff --git a/tests/compat_tests_data/index_v7/000002f0000000000000000000000000.idx b/tests/compat_tests_data/index_v7/000002f0000000000000000000000000.idx
new file mode 100644
index 0000000000000000000000000000000000000000..6d29a6b57f7bbc00c82751dfc03cca35c6aa930d
GIT binary patch
literal 130
zcmZQ%fPjWZMi{H90mN$P0tqzm0*Pv+vecsD%=|nht7@g(#H{=xB`X6RrQFOs5X;C|
zN2wsOBsl{tn3*aVqj$Wz}%IRSdyBSUzD3z!Uz;)b$Gt!FaQ9TkSIL>

literal 0
HcmV?d00001

diff --git a/tests/compat_tests_data/index_v7/000002f0000000000000000000000000.term b/tests/compat_tests_data/index_v7/000002f0000000000000000000000000.term
new file mode 100644
index 0000000000000000000000000000000000000000..f1a6530979e2656da327985e0eb7898e3103a0b2
GIT binary patch
literal 349
zcmZQ#Km!7eM=u;da{JK1qn8+=QqoXb9ZJK*VLFldOfW$p4Q4Yjaakbg1F1z;1C|nC
zXka*ifDH#=CMrU85@Rz=4^#^S10w??y3I`uAlZg5Ahiv=K%!cyEVZaOGe1wss#+;G
zF)P1F$;v=SDK|3@#4Y^}7v(0F#KX0j*Xk%G7bPoM8JZay
TnwuCJS{m1e0Zq1KWncgR$B8a4

literal 0
HcmV?d00001

diff --git a/tests/compat_tests_data/index_v7/meta.json b/tests/compat_tests_data/index_v7/meta.json
new file mode 100644
index 0000000000..3304b1d52c
--- /dev/null
+++ b/tests/compat_tests_data/index_v7/meta.json
@@ -0,0 +1,40 @@
+{
+  "index_settings": {
+    "docstore_compression": "lz4",
+    "docstore_blocksize": 16384
+  },
+  "segments": [
+    {
+      "segment_id": "000002f0-0000-0000-0000-000000000000",
+      "max_doc": 1,
+      "deletes": null
+    }
+  ],
+  "schema": [
+    {
+      "name": "label",
+      "type": "text",
+      "options": {
+        "indexing": {
+          "record": "position",
+          "fieldnorms": true,
+          "tokenizer": "default"
+        },
+        "stored": true,
+        "fast": false
+      }
+    },
+    {
+      "name": "date",
+      "type": "date",
+      "options": {
+        "indexed": true,
+        "fieldnorms": true,
+        "fast": false,
+        "stored": true,
+        "precision": "seconds"
+      }
+    }
+  ],
+  "opstamp": 2
+}
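Note on the fixture: the schema stores the date field with
"precision": "seconds", which governs the indexed/fast-field
representation; the doc store copy is what format v7 now keeps at full
nanosecond resolution, as asserted by test_format_7 above. A sketch of
how the compat test consumes this directory (mirroring the test; the
fixture path is the one added by this patch):

    use tantivy::Index;

    fn main() {
        let path = std::path::Path::new("tests/compat_tests_data/index_v7");
        let index = Index::open_in_dir(path).expect("Failed to open index");
        let _ = index; // the test then asserts nanosecond date round-trips
    }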