diff --git a/src/lazy/any_encoding.rs b/src/lazy/any_encoding.rs index 3a643c81..35870dd2 100644 --- a/src/lazy/any_encoding.rs +++ b/src/lazy/any_encoding.rs @@ -44,6 +44,7 @@ use crate::lazy::expanded::EncodingContextRef; use crate::lazy::raw_stream_item::LazyRawStreamItem; use crate::lazy::raw_value_ref::RawValueRef; use crate::lazy::span::Span; +use crate::lazy::streaming_raw_reader::RawReaderState; use crate::lazy::text::raw::r#struct::{ LazyRawTextFieldName_1_0, LazyRawTextStruct_1_0, RawTextStructIterator_1_0, }; @@ -599,20 +600,20 @@ impl<'data> LazyRawReader<'data, AnyEncoding> for LazyRawAnyReader<'data> { } } - fn stream_data(&self) -> (&'data [u8], usize, IonEncoding) { + fn save_state(&self) -> RawReaderState<'data> { use RawReaderKind::*; - let (remaining_data, stream_offset, mut encoding) = match &self.encoding_reader { - Text_1_0(r) => r.stream_data(), - Binary_1_0(r) => r.stream_data(), - Text_1_1(r) => r.stream_data(), - Binary_1_1(r) => r.stream_data(), + let reader_state = match &self.encoding_reader { + Text_1_0(r) => r.save_state(), + Binary_1_0(r) => r.save_state(), + Text_1_1(r) => r.save_state(), + Binary_1_1(r) => r.save_state(), }; // If we hit an IVM that changed the encoding but we haven't changed our reader yet, // we still want to report the new encoding. if let Some(new_encoding) = self.new_encoding { - encoding = new_encoding; + return RawReaderState::new(reader_state.data(), reader_state.offset(), new_encoding); } - (remaining_data, stream_offset, encoding) + reader_state } fn next<'top>( @@ -625,9 +626,12 @@ impl<'data> LazyRawReader<'data, AnyEncoding> for LazyRawAnyReader<'data> { // If we previously ran into an IVM that changed the stream encoding, replace our reader // with one that can read the new encoding. 
if let Some(new_encoding) = self.new_encoding.take() { - let (remaining_data, stream_offset, _) = self.stream_data(); - let new_encoding_reader = - RawReaderKind::resume_at_offset(remaining_data, stream_offset, new_encoding); + let reader_state = self.save_state(); + let new_encoding_reader = RawReaderKind::resume_at_offset( + reader_state.data(), + reader_state.offset(), + new_encoding, + ); self.encoding_reader = new_encoding_reader; } @@ -1149,10 +1153,12 @@ impl<'top> LazyContainerPrivate<'top, AnyEncoding> for LazyRawAnyList<'top> { } } +#[derive(Debug, Copy, Clone)] pub struct RawAnyListIterator<'data> { encoding: RawAnyListIteratorKind<'data>, } +#[derive(Debug, Copy, Clone)] pub enum RawAnyListIteratorKind<'data> { Text_1_0(RawTextListIterator_1_0<'data>), Binary_1_0(RawBinarySequenceIterator_1_0<'data>), @@ -1310,10 +1316,12 @@ impl<'data> LazyContainerPrivate<'data, AnyEncoding> for LazyRawAnySExp<'data> { } } +#[derive(Debug, Copy, Clone)] pub struct RawAnySExpIterator<'data> { encoding: RawAnySExpIteratorKind<'data>, } +#[derive(Debug, Copy, Clone)] pub enum RawAnySExpIteratorKind<'data> { Text_1_0(RawTextSExpIterator_1_0<'data>), Binary_1_0(RawBinarySequenceIterator_1_0<'data>), @@ -1513,10 +1521,12 @@ impl<'top> From> for LazyRawAnyFieldName<'top> } } +#[derive(Debug, Copy, Clone)] pub struct RawAnyStructIterator<'data> { encoding: RawAnyStructIteratorKind<'data>, } +#[derive(Debug, Copy, Clone)] pub enum RawAnyStructIteratorKind<'data> { Text_1_0(RawTextStructIterator_1_0<'data>), Binary_1_0(RawBinaryStructIterator_1_0<'data>), diff --git a/src/lazy/binary/raw/reader.rs b/src/lazy/binary/raw/reader.rs index e663d3bc..cbb31542 100644 --- a/src/lazy/binary/raw/reader.rs +++ b/src/lazy/binary/raw/reader.rs @@ -10,6 +10,7 @@ use crate::{Encoding, IonResult}; use crate::lazy::any_encoding::IonEncoding; use crate::lazy::expanded::EncodingContextRef; +use crate::lazy::streaming_raw_reader::RawReaderState; /// A binary Ion 1.0 reader that yields 
[`LazyRawBinaryValue_1_0`]s representing the top level values found /// in the provided input stream. @@ -118,9 +119,9 @@ impl<'data> LazyRawReader<'data, BinaryEncoding_1_0> for LazyRawBinaryReader_1_0 } } - fn stream_data(&self) -> (&'data [u8], usize, IonEncoding) { + fn save_state(&self) -> RawReaderState<'data> { let stream_offset = self.position(); - ( + RawReaderState::new( &self.data.buffer.bytes()[self.data.bytes_to_skip..], stream_offset, IonEncoding::Binary_1_0, @@ -148,6 +149,7 @@ impl<'data> LazyRawReader<'data, BinaryEncoding_1_0> for LazyRawBinaryReader_1_0 /// Wraps an [`ImmutableBuffer`], allowing the reader to advance each time an item is successfully /// parsed from it. +#[derive(Debug, Copy, Clone)] pub(crate) struct DataSource<'data> { // The buffer we're reading from buffer: ImmutableBuffer<'data>, diff --git a/src/lazy/binary/raw/sequence.rs b/src/lazy/binary/raw/sequence.rs index 80113cb0..1ed9943b 100644 --- a/src/lazy/binary/raw/sequence.rs +++ b/src/lazy/binary/raw/sequence.rs @@ -139,6 +139,7 @@ impl<'a> Debug for LazyRawBinarySequence_1_0<'a> { } } +#[derive(Debug, Copy, Clone)] pub struct RawBinarySequenceIterator_1_0<'top> { source: DataSource<'top>, } diff --git a/src/lazy/binary/raw/struct.rs b/src/lazy/binary/raw/struct.rs index ab247811..72a2a463 100644 --- a/src/lazy/binary/raw/struct.rs +++ b/src/lazy/binary/raw/struct.rs @@ -86,6 +86,7 @@ impl<'top> LazyRawStruct<'top, BinaryEncoding_1_0> for LazyRawBinaryStruct_1_0<' } } +#[derive(Debug, Copy, Clone)] pub struct RawBinaryStructIterator_1_0<'top> { source: DataSource<'top>, } diff --git a/src/lazy/binary/raw/v1_1/e_expression.rs b/src/lazy/binary/raw/v1_1/e_expression.rs index 183ca389..b70f3cda 100644 --- a/src/lazy/binary/raw/v1_1/e_expression.rs +++ b/src/lazy/binary/raw/v1_1/e_expression.rs @@ -281,8 +281,9 @@ impl<'top> Iterator for BinaryEExpArgsInputIter<'top> { let expr = EExpArgExpr::ArgGroup(BinaryEExpArgGroup::new(parameter, input, 0)); (EExpArg::new(parameter, 
expr), self.remaining_args_buffer) } - // If it's a tagged value expression, parse it as usual. + // It's a single expression; we'll need to look at the parameter's declared encoding. ArgGrouping::ValueExprLiteral => match parameter.encoding() { + // The encoding starts with an opcode. ParameterEncoding::Tagged => { let (expr, remaining) = try_or_some_err! { self @@ -291,6 +292,7 @@ impl<'top> Iterator for BinaryEExpArgsInputIter<'top> { }; (EExpArg::new(parameter, expr), remaining) } + // It's a FlexUInt. ParameterEncoding::FlexUInt => { let (flex_uint_lazy_value, remaining) = try_or_some_err! { self.remaining_args_buffer.read_flex_uint_as_lazy_value() @@ -304,7 +306,7 @@ impl<'top> Iterator for BinaryEExpArgsInputIter<'top> { EExpArg::new(parameter, EExpArgExpr::ValueLiteral(value_ref)), remaining, ) - } + } // TODO: The other tagless encodings }, // If it's an argument group... ArgGrouping::ArgGroup => { diff --git a/src/lazy/binary/raw/v1_1/immutable_buffer.rs b/src/lazy/binary/raw/v1_1/immutable_buffer.rs index c0b9478e..c2dc99d2 100644 --- a/src/lazy/binary/raw/v1_1/immutable_buffer.rs +++ b/src/lazy/binary/raw/v1_1/immutable_buffer.rs @@ -247,7 +247,7 @@ impl<'a> ImmutableBuffer<'a> { } // XXX: This *doesn't* slice `self` because FlexUInt::read() is faster if the input // is at least the size of a u64. 
- let matched_input = self; + let matched_input = self.slice(0, size_in_bytes); let remaining_input = self.slice_to_end(size_in_bytes); let value = LazyRawBinaryValue_1_1::for_flex_uint(matched_input); Ok((value, remaining_input)) @@ -622,7 +622,7 @@ impl<'a> ImmutableBuffer<'a> { let header = opcode .to_header() .ok_or_else(|| IonError::decoding_error("found a non-value in value position .."))?; - + let header_offset = input.offset(); let (total_length, length_length, value_body_length, delimited_contents) = if opcode.is_delimited_start() { let (contents, after) = input.peek_delimited_container(opcode)?; let total_length = after.offset() - self.offset(); @@ -646,10 +646,17 @@ impl<'a> ImmutableBuffer<'a> { (total_length, length_length, value_length, DelimitedContents::None) }; - let header_offset = input.offset(); + if total_length > input.len() { + return IonResult::incomplete( + "the stream ended unexpectedly in the middle of a value", + header_offset, + ); + } + let encoded_value = EncodedValue { encoding: ParameterEncoding::Tagged, header, + // If applicable, these are populated by the caller: `read_annotated_value()` annotations_header_length: 0, annotations_sequence_length: 0, annotations_encoding: AnnotationsEncoding::SymbolAddress, diff --git a/src/lazy/binary/raw/v1_1/reader.rs b/src/lazy/binary/raw/v1_1/reader.rs index e460d127..21519d43 100644 --- a/src/lazy/binary/raw/v1_1/reader.rs +++ b/src/lazy/binary/raw/v1_1/reader.rs @@ -8,6 +8,7 @@ use crate::lazy::encoder::private::Sealed; use crate::lazy::encoding::BinaryEncoding_1_1; use crate::lazy::expanded::EncodingContextRef; use crate::lazy::raw_stream_item::{EndPosition, LazyRawStreamItem, RawStreamItem}; +use crate::lazy::streaming_raw_reader::RawReaderState; use crate::{Encoding, IonResult}; pub struct LazyRawBinaryReader_1_1<'data> { @@ -107,8 +108,8 @@ impl<'data> LazyRawReader<'data, BinaryEncoding_1_1> for LazyRawBinaryReader_1_1 Self::new_with_offset(data, offset) } - fn stream_data(&self) -> 
(&'data [u8], usize, IonEncoding) { - ( + fn save_state(&self) -> RawReaderState<'data> { + RawReaderState::new( &self.input[self.local_offset..], self.position(), self.encoding(), diff --git a/src/lazy/binary/raw/v1_1/sequence.rs b/src/lazy/binary/raw/v1_1/sequence.rs index ec527102..6e1057e6 100644 --- a/src/lazy/binary/raw/v1_1/sequence.rs +++ b/src/lazy/binary/raw/v1_1/sequence.rs @@ -22,8 +22,16 @@ pub struct LazyRawBinarySExp_1_1<'top> { impl<'top> LazyContainerPrivate<'top, BinaryEncoding_1_1> for LazyRawBinaryList_1_1<'top> { fn from_value(value: &'top LazyRawBinaryValue_1_1<'top>) -> Self { + let delimited_expr_cache = match value.delimited_contents { + DelimitedContents::None => None, + DelimitedContents::Values(values) => Some(values), + DelimitedContents::Fields(_) => unreachable!("sequence contained fields"), + }; LazyRawBinaryList_1_1 { - sequence: LazyRawBinarySequence_1_1 { value }, + sequence: LazyRawBinarySequence_1_1 { + value, + delimited_expr_cache, + }, } } } @@ -52,8 +60,16 @@ impl<'top> LazyRawSequence<'top, BinaryEncoding_1_1> for LazyRawBinaryList_1_1<' impl<'top> LazyContainerPrivate<'top, BinaryEncoding_1_1> for LazyRawBinarySExp_1_1<'top> { fn from_value(value: &'top LazyRawBinaryValue_1_1<'top>) -> Self { + let delimited_expr_cache = match value.delimited_contents { + DelimitedContents::None => None, + DelimitedContents::Values(values) => Some(values), + DelimitedContents::Fields(_) => unreachable!("sequence contained fields"), + }; LazyRawBinarySExp_1_1 { - sequence: LazyRawBinarySequence_1_1 { value }, + sequence: LazyRawBinarySequence_1_1 { + value, + delimited_expr_cache, + }, } } } @@ -83,11 +99,18 @@ impl<'top> LazyRawSequence<'top, BinaryEncoding_1_1> for LazyRawBinarySExp_1_1<' #[derive(Copy, Clone)] pub struct LazyRawBinarySequence_1_1<'top> { pub(crate) value: &'top LazyRawBinaryValue_1_1<'top>, + pub(crate) delimited_expr_cache: Option<&'top [LazyRawValueExpr<'top, BinaryEncoding_1_1>]>, } impl<'top> 
LazyRawBinarySequence_1_1<'top> { - pub fn new(value: &'top LazyRawBinaryValue_1_1<'top>) -> Self { - Self { value } + pub fn new( + value: &'top LazyRawBinaryValue_1_1<'top>, + delimited_expr_cache: Option<&'top [LazyRawValueExpr<'top, BinaryEncoding_1_1>]>, + ) -> Self { + Self { + value, + delimited_expr_cache, + } } pub fn ion_type(&self) -> IonType { @@ -102,7 +125,7 @@ impl<'top> LazyRawBinarySequence_1_1<'top> { } else { self.value.value_body_buffer() }; - RawBinarySequenceIterator_1_1::new(buffer_slice, self.value.delimited_contents) + RawBinarySequenceIterator_1_1::new(buffer_slice, self.delimited_expr_cache) } } @@ -139,26 +162,24 @@ impl<'a> Debug for LazyRawBinarySequence_1_1<'a> { } } +#[derive(Debug, Copy, Clone)] pub struct RawBinarySequenceIterator_1_1<'top> { source: ImmutableBuffer<'top>, bytes_to_skip: usize, - delimited_contents: DelimitedContents<'top>, - delimited_iter: Option>>, + delimited_expr_cache: Option<&'top [LazyRawValueExpr<'top, BinaryEncoding_1_1>]>, + expr_cache_index: usize, } impl<'top> RawBinarySequenceIterator_1_1<'top> { pub(crate) fn new( input: ImmutableBuffer<'top>, - delimited_contents: DelimitedContents<'top>, + delimited_expr_cache: Option<&'top [LazyRawValueExpr<'top, BinaryEncoding_1_1>]>, ) -> RawBinarySequenceIterator_1_1<'top> { RawBinarySequenceIterator_1_1 { source: input, bytes_to_skip: 0, - delimited_contents, - delimited_iter: match &delimited_contents { - DelimitedContents::Values(vals) => Some(vals.iter()), - _ => None, - }, + delimited_expr_cache, + expr_cache_index: 0, } } } @@ -167,14 +188,17 @@ impl<'top> Iterator for RawBinarySequenceIterator_1_1<'top> { type Item = IonResult>; fn next(&mut self) -> Option { - if let Some(ref mut inner_iter) = &mut self.delimited_iter { - inner_iter.next().map(|val| Ok(*val)) + if let Some(expr_cache) = self.delimited_expr_cache { + let expr = expr_cache.get(self.expr_cache_index)?; + self.expr_cache_index += 1; + Some(Ok(*expr)) } else { self.source = 
self.source.consume(self.bytes_to_skip); - let (maybe_item, remaining_input) = try_or_some_err!(self.source.read_sequence_value_expr()); + let (maybe_item, remaining_input) = + try_or_some_err!(self.source.read_sequence_value_expr()); if let Some(item) = maybe_item { self.source = remaining_input; - return Some(Ok(item)) + return Some(Ok(item)); } None } diff --git a/src/lazy/binary/raw/v1_1/struct.rs b/src/lazy/binary/raw/v1_1/struct.rs index a85c9fcd..c140e7ca 100644 --- a/src/lazy/binary/raw/v1_1/struct.rs +++ b/src/lazy/binary/raw/v1_1/struct.rs @@ -57,6 +57,8 @@ impl<'top> LazyRawFieldName<'top, BinaryEncoding_1_1> for LazyRawBinaryFieldName #[derive(Copy, Clone)] pub struct LazyRawBinaryStruct_1_1<'top> { pub(crate) value: &'top LazyRawBinaryValue_1_1<'top>, + pub(crate) delimited_field_expr_cache: + Option<&'top [LazyRawFieldExpr<'top, BinaryEncoding_1_1>]>, } impl<'a, 'top> IntoIterator for &'a LazyRawBinaryStruct_1_1<'top> { @@ -91,14 +93,14 @@ impl<'top> LazyRawBinaryStruct_1_1<'top> { RawBinaryStructIterator_1_1::new( self.value.encoded_value.header.ion_type_code, self.value.input.consume(1), - self.value.delimited_contents, + self.delimited_field_expr_cache, ) } else { let buffer_slice = self.value.value_body_buffer(); RawBinaryStructIterator_1_1::new( self.value.encoded_value.header.ion_type_code, buffer_slice, - self.value.delimited_contents, + None, ) } } @@ -106,7 +108,15 @@ impl<'top> LazyRawBinaryStruct_1_1<'top> { impl<'top> LazyContainerPrivate<'top, BinaryEncoding_1_1> for LazyRawBinaryStruct_1_1<'top> { fn from_value(value: &'top LazyRawBinaryValue_1_1<'top>) -> Self { - LazyRawBinaryStruct_1_1 { value } + let delimited_field_expr_cache = match value.delimited_contents { + DelimitedContents::None => None, + DelimitedContents::Fields(fields) => Some(fields), + DelimitedContents::Values(_) => unreachable!("struct contained sequence values"), + }; + LazyRawBinaryStruct_1_1 { + value, + delimited_field_expr_cache, + } } } @@ -128,7 +138,7 @@ 
impl<'top> LazyRawStruct<'top, BinaryEncoding_1_1> for LazyRawBinaryStruct_1_1<' } } -#[derive(Clone, Copy)] +#[derive(Debug, Copy, Clone)] enum StructMode { FlexSym, SymbolAddress, @@ -139,18 +149,19 @@ enum SymAddressFieldName<'top> { FieldName(LazyRawBinaryFieldName_1_1<'top>), } +#[derive(Debug, Copy, Clone)] pub struct RawBinaryStructIterator_1_1<'top> { source: ImmutableBuffer<'top>, mode: StructMode, - delimited_contents: DelimitedContents<'top>, - delimited_iter: Option>>, + field_expr_index: usize, + field_expr_cache: Option<&'top [LazyRawFieldExpr<'top, BinaryEncoding_1_1>]>, } impl<'top> RawBinaryStructIterator_1_1<'top> { pub(crate) fn new( opcode_type: OpcodeType, input: ImmutableBuffer<'top>, - delimited_contents: DelimitedContents<'top>, + field_expr_cache: Option<&'top [LazyRawFieldExpr<'top, BinaryEncoding_1_1>]>, ) -> RawBinaryStructIterator_1_1<'top> { RawBinaryStructIterator_1_1 { source: input, @@ -159,11 +170,8 @@ impl<'top> RawBinaryStructIterator_1_1<'top> { OpcodeType::StructDelimited => StructMode::FlexSym, _ => unreachable!("Unexpected opcode for structure"), }, - delimited_contents, - delimited_iter: match &delimited_contents { - DelimitedContents::Fields(fields) => Some(fields.iter()), - _ => None, - }, + field_expr_cache, + field_expr_index: 0, } } @@ -188,7 +196,7 @@ impl<'top> RawBinaryStructIterator_1_1<'top> { _ => unreachable!(), }; - let matched_field_id = buffer.slice(0, after.offset() - buffer.offset()); + let matched_field_id = buffer.slice(0, flex_sym.size_in_bytes()); let field_name = LazyRawBinaryFieldName_1_1::new(sym, matched_field_id); Ok(Some((field_name, after))) } @@ -297,8 +305,10 @@ impl<'top> Iterator for RawBinaryStructIterator_1_1<'top> { type Item = IonResult>; fn next(&mut self) -> Option { - if let Some(ref mut inner_iter) = &mut self.delimited_iter { - inner_iter.next().map(|val| Ok(*val)) + if let Some(field_cache) = self.field_expr_cache { + let field = field_cache.get(self.field_expr_index)?; + 
self.field_expr_index += 1; + Some(Ok(*field)) } else { let (field_expr, after, mode) = match Self::peek_field(self.source, self.mode) { Ok((Some((value, mode)), after)) => (Some(Ok(value)), after, mode), diff --git a/src/lazy/binary/raw/v1_1/value.rs b/src/lazy/binary/raw/v1_1/value.rs index c7f9ef26..8765c72c 100644 --- a/src/lazy/binary/raw/v1_1/value.rs +++ b/src/lazy/binary/raw/v1_1/value.rs @@ -3,9 +3,12 @@ use std::fmt::Debug; use std::ops::Range; +use num_traits::PrimInt; + use crate::lazy::binary::raw::v1_1::immutable_buffer::AnnotationsEncoding; use crate::lazy::binary::raw::v1_1::r#struct::LazyRawBinaryStruct_1_1; use crate::lazy::binary::raw::v1_1::sequence::{LazyRawBinaryList_1_1, LazyRawBinarySExp_1_1}; +use crate::lazy::binary::raw::value::EncodedBinaryValue; use crate::lazy::bytes_ref::BytesRef; use crate::lazy::decoder::{HasRange, HasSpan, RawVersionMarker}; use crate::lazy::expanded::template::ParameterEncoding; @@ -36,7 +39,6 @@ use crate::{ Decimal, Int, IonEncoding, IonError, IonResult, IonType, LazyExpandedList, LazyExpandedSExp, LazyExpandedStruct, LazyList, LazySExp, LazyStruct, RawSymbolRef, SymbolRef, ValueRef, }; -use num_traits::PrimInt; const LONG_TIMESTAMP_OFFSET_BIAS: i32 = -60 * 24; @@ -93,19 +95,6 @@ impl<'top> RawVersionMarker<'top> for LazyRawBinaryVersionMarker_1_1<'top> { } } -#[derive(Debug, Copy, Clone)] -pub enum DelimitedContents<'top> { - None, - Values(&'top [LazyRawValueExpr<'top, BinaryEncoding_1_1>]), - Fields(&'top [LazyRawFieldExpr<'top, BinaryEncoding_1_1>]), -} - -impl<'top> DelimitedContents<'top> { - pub fn is_none(&self) -> bool { - matches!(self, Self::None) - } -} - #[derive(Debug, Copy, Clone)] pub struct LazyRawBinaryValue_1_1<'top> { pub(crate) encoded_value: EncodedValue
, @@ -279,6 +268,19 @@ impl<'top> LazyRawValue<'top, BinaryEncoding_1_1> for &'top LazyRawBinaryValue_1 } } +#[derive(Debug, Copy, Clone)] +pub enum DelimitedContents<'top> { + None, + Values(&'top [LazyRawValueExpr<'top, BinaryEncoding_1_1>]), + Fields(&'top [LazyRawFieldExpr<'top, BinaryEncoding_1_1>]), +} + +impl<'top> DelimitedContents<'top> { + pub fn is_none(&self) -> bool { + matches!(self, Self::None) + } +} + impl<'top> LazyRawBinaryValue_1_1<'top> { /// Constructs a lazy raw binary value from an input buffer slice that has been found to contain /// a complete `FlexUInt`. @@ -288,7 +290,10 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { header: Header { // It is an int, that's true. ion_type: IonType::Int, - // Nonsense values for now + // Eventually we'll refactor `EncodedValue` to accommodate values that don't have + // a header (i.e., parameters with tagless encodings). See: + // https://github.com/amazon-ion/ion-rust/issues/805 + // For now, we'll populate these fields with nonsense values and ignore them. ion_type_code: OpcodeType::Nop, low_nibble: 0, }, @@ -343,8 +348,8 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { /// Reads this value's data, returning it as a [`RawValueRef`]. If this value is a container, /// calling this method will not read additional data; the `RawValueRef` will provide a - /// [`LazyRawBinarySequence_1_1`](crate::lazy::binary::raw::v1_1::sequence::LazyRawBinarySequence_1_1) - /// or [`LazyStruct`] that can be traversed to access the container's contents. + /// lazy sequence or lazy struct that can be traversed to access the container's + /// contents. 
pub fn read(&'top self) -> ValueParseResult<'top, BinaryEncoding_1_1> { <&'top Self as LazyRawValue<'top, BinaryEncoding_1_1>>::read(&self) } @@ -839,3 +844,35 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { Ok(LazyRawBinaryStruct_1_1::from_value(self)) } } + +impl<'top> EncodedBinaryValue<'top, BinaryEncoding_1_1> for &'top LazyRawBinaryValue_1_1<'top> { + fn opcode_length(&self) -> usize { + self.encoded_value.opcode_length as usize + } + + fn length_length(&self) -> usize { + self.encoded_value.length_length as usize + } + + fn body_length(&self) -> usize { + self.encoded_value.value_body_length + } + + fn annotations_sequence_length(&self) -> usize { + self.encoded_value.annotations_sequence_length() + } + + fn annotations_sequence_length_span(&self) -> Span<'top> { + let header_span = self.annotations_header_span(); + let sequence_length_offset = header_span.range().start + 1; + let sequence_length_bytes = &header_span.bytes()[sequence_length_offset..]; + Span::with_offset(sequence_length_offset, sequence_length_bytes) + } + + fn annotations_wrapper_length_span(&self) -> Span<'top> { + // Ion 1.1 does not include an encoded wrapper length, so we return an empty span + // that follows the opcode. (This parallels the location of the wrapper length + // subfield found in Ion 1.0.) 
+ Span::with_offset(self.annotations_span().range().start + 1, &[]) + } +} diff --git a/src/lazy/binary/raw/value.rs b/src/lazy/binary/raw/value.rs index 24b15ffa..298490a9 100644 --- a/src/lazy/binary/raw/value.rs +++ b/src/lazy/binary/raw/value.rs @@ -1,5 +1,9 @@ #![allow(non_camel_case_types)] +use std::fmt::{Debug, Formatter}; +use std::ops::Range; +use std::{fmt, mem}; + use crate::binary::int::DecodedInt; use crate::binary::uint::DecodedUInt; use crate::lazy::binary::encoded_value::EncodedValue; @@ -17,10 +21,9 @@ use crate::lazy::span::Span; use crate::lazy::str_ref::StrRef; use crate::result::IonFailure; use crate::types::SymbolId; -use crate::{Decimal, Int, IonEncoding, IonError, IonResult, IonType, RawSymbolRef, Timestamp}; -use std::fmt::{Debug, Formatter}; -use std::ops::Range; -use std::{fmt, mem}; +use crate::{ + Decimal, Decoder, Int, IonEncoding, IonError, IonResult, IonType, RawSymbolRef, Timestamp, +}; #[derive(Debug, Copy, Clone)] pub struct LazyRawBinaryVersionMarker_1_0<'top> { @@ -140,6 +143,127 @@ impl<'top> LazyRawValue<'top, BinaryEncoding_1_0> for LazyRawBinaryValue_1_0<'to } } +pub trait EncodedBinaryValue<'top, D: Decoder>: LazyRawValue<'top, D> { + fn opcode_length(&self) -> usize; + fn length_length(&self) -> usize; + fn body_length(&self) -> usize; + fn annotations_sequence_length(&self) -> usize; + + /// The span containing the annotations sequence's opcode. + fn annotations_opcode_span(&self) -> Span<'top> { + let annotations_span = self.annotations_span(); + if annotations_span.is_empty() { + return annotations_span; + } + Span::with_offset( + annotations_span.range().start, + &annotations_span.bytes()[0..1], + ) + } + /// The span containing the annotations sequence's encoded length. In Ion 1.1, the sequence + /// length is optional; if it is not present, the returned span will be empty. + fn annotations_sequence_length_span(&self) -> Span<'top>; + + /// The span containing the annotations sequence wrapper's length. 
This span will always be + /// empty for Ion 1.1 values. + fn annotations_wrapper_length_span(&self) -> Span<'top>; + /// The span containing both the opcode and length(s), but not the annotations sequence. + fn annotations_header_span(&self) -> Span<'top> { + let annotations_span = self.annotations_span(); + let sequence_length = self.annotations_sequence_length(); + let local_end = annotations_span.len() - sequence_length; + let bytes = &annotations_span.bytes()[..local_end]; + Span::with_offset(annotations_span.range().start, bytes) + } + /// The span containing the annotations sequence. + fn annotations_sequence_span(&self) -> Span<'top> { + let annotations_span = self.annotations_span(); + let sequence_length = self.annotations_sequence_length(); + let local_sequence_offset = annotations_span.len() - sequence_length; + let bytes = &annotations_span.bytes()[local_sequence_offset..]; + Span::with_offset( + annotations_span.range().start + local_sequence_offset, + bytes, + ) + } + /// The span containing the value's opcode. + fn value_opcode_span(&self) -> Span<'top> { + let value_span = self.value_span(); + Span::with_offset( + value_span.range().start, + &value_span.bytes()[0..self.opcode_length()], + ) + } + + /// The span containing the value's encoded length (if it is not encoded within the opcode.) + fn value_length_span(&self) -> Span<'top> { + let value_span = self.value_span(); + let value_range = value_span.range(); + let opcode_length = self.opcode_length(); + let length_length = self.length_length(); + let length_bytes = &value_span.bytes()[opcode_length..opcode_length + length_length]; + Span::with_offset(value_range.start + opcode_length, length_bytes) + } + + /// The span containing the value's opcode and length. 
+ fn value_header_span(&self) -> Span<'top> { + let value_span = self.value_span(); + let opcode_length = self.opcode_length(); + let length_length = self.length_length(); + let header_bytes = &value_span.bytes()[..opcode_length + length_length]; + Span::with_offset(value_span.range().start, header_bytes) + } + + /// The span containing the value's body. + fn value_body_span(&self) -> Span<'top> { + let value_span = self.value_span(); + let body_length = self.body_length(); + let body_bytes = &value_span.bytes()[value_span.len() - body_length..]; + Span::with_offset(value_span.range().end - body_length, body_bytes) + } +} + +impl<'top> EncodedBinaryValue<'top, BinaryEncoding_1_0> for LazyRawBinaryValue_1_0<'top> { + fn opcode_length(&self) -> usize { + self.encoded_value.opcode_length as usize + } + + fn length_length(&self) -> usize { + self.encoded_value.length_length as usize + } + + fn body_length(&self) -> usize { + self.encoded_value.value_body_length + } + + fn annotations_sequence_length(&self) -> usize { + self.encoded_value.annotations_sequence_length() + } + + fn annotations_sequence_length_span(&self) -> Span<'top> { + let header_span = self.annotations_header_span(); + let wrapper_length_span = self.annotations_wrapper_length_span(); + let sequence_length_offset = wrapper_length_span.range().end; + let sequence_length_bytes = &header_span.bytes()[sequence_length_offset..]; + Span::with_offset(sequence_length_offset, sequence_length_bytes) + } + + fn annotations_wrapper_length_span(&self) -> Span<'top> { + let annotations_span = self.annotations_span(); + let wrapper_length_input = &annotations_span.bytes()[1..]; + // Don't read the VarUInt, but skim along looking for the END flag + let mut num_varuint_bytes = 1; + for &byte in wrapper_length_input { + if byte >= 0b1000_0000 { + break; + } + num_varuint_bytes += 1; + } + let sequence_length_bytes = &wrapper_length_input[..num_varuint_bytes]; + Span::with_offset(annotations_span.range().start + 1, 
sequence_length_bytes) + } +} + #[derive(Copy, Clone)] pub struct EncodedBinaryAnnotations_1_0<'a, 'top> { value: &'a LazyRawBinaryValue_1_0<'top>, diff --git a/src/lazy/decoder.rs b/src/lazy/decoder.rs index be6c1830..951fc875 100644 --- a/src/lazy/decoder.rs +++ b/src/lazy/decoder.rs @@ -14,6 +14,7 @@ use crate::lazy::expanded::{EncodingContext, EncodingContextRef}; use crate::lazy::raw_stream_item::LazyRawStreamItem; use crate::lazy::raw_value_ref::RawValueRef; use crate::lazy::span::Span; +use crate::lazy::streaming_raw_reader::RawReaderState; use crate::read_config::ReadConfig; use crate::result::IonFailure; use crate::{ @@ -38,6 +39,12 @@ pub trait HasRange { } } +impl HasRange for Range { + fn range(&self) -> Range { + self.start..self.end + } +} + /// A family of types that collectively comprise the lazy reader API for an Ion serialization /// format. These types operate at the 'raw' level; they do not attempt to resolve symbols /// using the active symbol table. @@ -440,7 +447,7 @@ pub trait LazyRawReader<'data, D: Decoder>: Sized { fn resume_at_offset(data: &'data [u8], offset: usize, encoding_hint: IonEncoding) -> Self; /// Deconstructs this reader, returning a tuple of `(remaining_data, stream_offset, encoding)`. - fn stream_data(&self) -> (&'data [u8], usize, IonEncoding); + fn save_state(&self) -> RawReaderState<'data>; fn next<'top>( &'top mut self, @@ -576,15 +583,53 @@ pub trait LazyRawValue<'top, D: Decoder>: fn value_span(&self) -> Span<'top>; } +pub trait RawSequenceIterator<'top, D: Decoder>: + Debug + Copy + Clone + Iterator>> +{ + /// Returns the next raw value expression (or `None` if exhausted) without advancing the iterator. + fn peek_next(&self) -> Option>> { + // Because RawSequenceIterator impls are `Copy`, we can make a cheap copy of `self` and advance + // *it* without affecting `self`. 
+ let mut iter_clone = *self; + iter_clone.next() + } +} + +impl<'top, D: Decoder, T> RawSequenceIterator<'top, D> for T +where + T: Debug + Copy + Clone + Iterator>>, +{ + // Nothing to do +} + pub trait LazyRawSequence<'top, D: Decoder>: LazyRawContainer<'top, D> + private::LazyContainerPrivate<'top, D> + Debug + Copy + Clone { - type Iterator: Iterator>>; + type Iterator: RawSequenceIterator<'top, D>; fn annotations(&self) -> D::AnnotationsIterator<'top>; fn ion_type(&self) -> IonType; fn iter(&self) -> Self::Iterator; } +pub trait RawStructIterator<'top, D: Decoder>: + Debug + Copy + Clone + Iterator>> +{ + /// Returns the next raw field expression (or `None` if exhausted) without advancing the iterator. + fn peek_next(&self) -> Option>> { + // Because RawStructIterator impls are `Copy`, we can make a cheap copy of `self` and advance + // *it* without affecting `self`. + let mut iter_clone = *self; + iter_clone.next() + } +} + +impl<'top, D: Decoder, T> RawStructIterator<'top, D> for T +where + T: Debug + Copy + Clone + Iterator>>, +{ + // Nothing to do +} + pub trait LazyRawStruct<'top, D: Decoder>: LazyRawContainer<'top, D> + private::LazyContainerPrivate<'top, D> @@ -593,7 +638,7 @@ pub trait LazyRawStruct<'top, D: Decoder>: + Copy + Clone { - type Iterator: Iterator>>; + type Iterator: RawStructIterator<'top, D>; fn annotations(&self) -> D::AnnotationsIterator<'top>; diff --git a/src/lazy/expanded/compiler.rs b/src/lazy/expanded/compiler.rs index 4b53907a..90578f27 100644 --- a/src/lazy/expanded/compiler.rs +++ b/src/lazy/expanded/compiler.rs @@ -4,6 +4,7 @@ use std::ops::Range; use rustc_hash::FxHashMap; +use crate::element::iterators::SymbolsIterator; use crate::lazy::decoder::Decoder; use crate::lazy::expanded::template::{ ExprRange, MacroSignature, Parameter, ParameterCardinality, ParameterEncoding, @@ -17,7 +18,7 @@ use crate::lazy::value::LazyValue; use crate::lazy::value_ref::ValueRef; use crate::result::IonFailure; use 
crate::symbol_ref::AsSymbolRef; -use crate::{v1_1, IonError, IonResult, IonType, Reader, SymbolRef}; +use crate::{v1_1, IonError, IonResult, IonType, Reader, Symbol, SymbolRef}; /// Information inferred about a template's expansion at compile time. #[derive(Copy, Clone, Debug, PartialEq)] @@ -47,10 +48,20 @@ impl ExpansionAnalysis { } } -/// When static analysis can detect that a template body will always expand to a single value, -/// information inferred about that value is stored in this type. When this template backs a -/// lazy value, having these fields available allows the lazy value to answer basic queries without -/// needing to fully evaluate the template. +/// When the [`TemplateCompiler`] is able to determine that a macro's template will always produce +/// exactly one value, that macro is considered a "singleton macro." Singleton macros offer +/// a few benefits: +/// +/// * Because evaluation will produce exactly one value, the reader can hand out a LazyValue +/// holding the e-expression as its backing data. Other macros cannot do this because if you're +/// holding a LazyValue and the macro later evaluates to 0 values or 100 values, there's not a way +/// for the application to handle those outcomes. +/// * Expanding a singleton macro doesn't require an evaluator with a stack because as soon as +/// you've gotten a value, you're done--no need to `pop()` and preserve state. +/// +/// Information inferred about a singleton macro's output value is stored in an `ExpansionSingleton`. +/// When a singleton macro backs a lazy value, having these fields available allows the lazy value to +/// answer basic queries without needing to fully evaluate the template. 
#[derive(Copy, Clone, Debug, PartialEq)] pub struct ExpansionSingleton { pub(crate) is_null: bool, @@ -74,6 +85,12 @@ impl ExpansionSingleton { pub fn num_annotations(&self) -> usize { self.num_annotations as usize } + + pub fn annotations<'a>(&self, annotations_storage: &'a [Symbol]) -> SymbolsIterator<'a> { + let annotations_range = 0..self.num_annotations(); + let annotations = &annotations_storage[annotations_range]; + SymbolsIterator::new(annotations) + } } /// Validates a given TDL expression and compiles it into a `TemplateMacro` that can be added diff --git a/src/lazy/expanded/e_expression.rs b/src/lazy/expanded/e_expression.rs index d930bf9d..c32ac411 100644 --- a/src/lazy/expanded/e_expression.rs +++ b/src/lazy/expanded/e_expression.rs @@ -4,6 +4,7 @@ use std::fmt::{Debug, Formatter}; use std::ops::Range; +use crate::element::iterators::SymbolsIterator; use crate::lazy::decoder::{Decoder, RawValueExpr}; use crate::lazy::encoding::TextEncoding_1_1; use crate::lazy::expanded::compiler::{ExpansionAnalysis, ExpansionSingleton}; @@ -150,14 +151,39 @@ impl<'top, D: Decoder> EExpression<'top, D> { self.invoked_macro.expansion_analysis() } + /// Returns `true` if this `EExpression` was statically determined to always return exactly + /// one value. If this method returns `false`, no assertion about the expansion's cardinality + /// is made--the evaluation may still produce one value. + pub fn is_singleton(&self) -> bool { + self.expansion_singleton().is_some() + } + pub fn expansion_singleton(&self) -> Option { self.expansion_analysis().expansion_singleton() } + + /// Returns the `ExpansionSingleton` describing the template expansion information that was + /// inferred from the macro compilation process. + /// /// Caller must guarantee that this e-expression invokes a template and that the template /// has a `ExpansionSingleton`. If these prerequisites are not met, this method will panic. 
pub fn require_expansion_singleton(&self) -> ExpansionSingleton { self.expansion_singleton().unwrap() } + + /// Returns the annotations that this template expansion will produce, as inferred from the + /// macro compilation process. + /// + /// Caller must guarantee that this e-expression invokes a template and that the template + /// has a `ExpansionSingleton`. If these prerequisites are not met, this method will panic. + pub fn require_singleton_annotations(&self) -> SymbolsIterator<'top> { + let storage = self + .invoked_macro + .require_template() + .body() + .annotations_storage(); + self.expansion_singleton().unwrap().annotations(storage) + } } impl<'top, D: Decoder> Debug for EExpression<'top, D> { diff --git a/src/lazy/expanded/macro_evaluator.rs b/src/lazy/expanded/macro_evaluator.rs index ea8fdfa8..aeb5074b 100644 --- a/src/lazy/expanded/macro_evaluator.rs +++ b/src/lazy/expanded/macro_evaluator.rs @@ -16,7 +16,6 @@ use std::fmt::{Debug, Formatter}; use std::ops::Range; use bumpalo::collections::{String as BumpString, Vec as BumpVec}; -use ice_code::ice; use crate::lazy::decoder::{Decoder, HasSpan, LazyRawValueExpr}; use crate::lazy::expanded::e_expression::{ @@ -34,7 +33,7 @@ use crate::lazy::str_ref::StrRef; use crate::lazy::text::raw::v1_1::arg_group::EExpArg; use crate::lazy::text::raw::v1_1::reader::MacroIdRef; use crate::result::IonFailure; -use crate::{ExpandedValueSource, HasRange, IonError, IonResult, LazyValue, SymbolRef, ValueRef}; +use crate::{ExpandedValueSource, IonError, IonResult, LazyValue, Span, SymbolRef, ValueRef}; pub trait EExpArgGroupIterator<'top, D: Decoder>: Copy + Clone + Debug + Iterator>> @@ -335,15 +334,36 @@ impl<'top, D: Decoder> ValueExpr<'top, D> { } } - /// If this `ValueExpr` represents an entity encoded in te data stream, returns `Some(range)`. + /// Returns `true` if this expression was produced by evaluating a macro. Otherwise, returns `false`. 
+ pub fn is_ephemeral(&self) -> bool { + match self { + ValueExpr::ValueLiteral(value) => value.is_ephemeral(), + ValueExpr::MacroInvocation(invocation) => { + use MacroExprKind::*; + match invocation.kind() { + TemplateMacro(_) => true, + EExp(_) => false, + EExpArgGroup(_) => false, + } + } + } + } + + /// If this `ValueExpr` represents an entity encoded in the data stream, returns `Some(range)`. /// If it represents a template value or a constructed value, returns `None`. pub fn range(&self) -> Option> { + self.span().as_ref().map(Span::range) + } + + /// If this `ValueExpr` represents an entity encoded in the data stream, returns `Some(range)`. + /// If it represents an ephemeral value produced by a macro evaluation, returns `None`. + pub fn span(&self) -> Option> { match self { ValueExpr::ValueLiteral(value) => { use ExpandedValueSource::*; match value.source { - EExp(_) => todo!(), - ValueLiteral(literal) => Some(literal.range()), + SingletonEExp(_) => todo!(), + ValueLiteral(literal) => Some(literal.span()), Template(_, _) => None, Constructed(_, _) => None, } @@ -352,8 +372,8 @@ impl<'top, D: Decoder> ValueExpr<'top, D> { use MacroExprKind::*; match e.source() { TemplateMacro(_) => None, - EExp(e) => Some(e.range()), - EExpArgGroup(g) => Some(g.range()), + EExp(e) => Some(e.span()), + EExpArgGroup(g) => Some(g.span()), } } } @@ -387,16 +407,16 @@ impl<'top, D: Decoder> MacroExpansion<'top, D> { } /// Expands the current macro with the expectation that it will produce exactly one value. + /// For more information about singleton macros, see + /// [`ExpansionSingleton`](crate::lazy::expanded::compiler::ExpansionSingleton). #[inline(always)] pub(crate) fn expand_singleton(mut self) -> IonResult> { // We don't need to construct an evaluator because this is guaranteed to produce exactly // one value. match self.next_step()? { - // If the expansion produces anything other than a final value, there's a bug. 
MacroExpansionStep::FinalStep(Some(ValueExpr::ValueLiteral(value))) => Ok(value), - _ => ice!(IonResult::decoding_error(format!( - "expansion of {self:?} was required to produce exactly one value", - ))), + // If the expansion produces anything other than a final value, there's a bug. + _ => unreachable!("expansion of {self:?} was required to produce exactly one value"), } } @@ -444,6 +464,12 @@ impl<'top, D: Decoder> MacroExpansion<'top, D> { Void => Ok(MacroExpansionStep::FinalStep(None)), } } + + // Calculate the next step in this macro expansion without advancing the expansion. + pub fn peek_next_step(&self) -> IonResult> { + let mut expansion_copy = *self; + expansion_copy.next_step() + } } impl<'top, D: Decoder> Debug for MacroExpansion<'top, D> { diff --git a/src/lazy/expanded/macro_table.rs b/src/lazy/expanded/macro_table.rs index ff4f1838..f00e4a24 100644 --- a/src/lazy/expanded/macro_table.rs +++ b/src/lazy/expanded/macro_table.rs @@ -131,7 +131,7 @@ impl<'top> MacroRef<'top> { pub fn id_text(&'top self) -> Cow<'top, str> { self.name() .map(Cow::from) - .unwrap_or_else(move || Cow::from(format!("", self.address()))) + .unwrap_or_else(move || Cow::from(format!("{}", self.address()))) } pub fn address(&self) -> MacroAddress { @@ -179,7 +179,7 @@ impl MacroTable { pub const NUM_SYSTEM_MACROS: usize = Self::SYSTEM_MACRO_KINDS.len(); // When a user defines new macros, this is the first ID that will be assigned. This value // is expected to change as development continues. It is currently used in several unit tests. 
- pub const FIRST_USER_MACRO_ID: usize = 4; + pub const FIRST_USER_MACRO_ID: usize = Self::NUM_SYSTEM_MACROS; pub fn new() -> Self { let macros_by_id = vec![ diff --git a/src/lazy/expanded/mod.rs b/src/lazy/expanded/mod.rs index 289aef6f..cbde1003 100644 --- a/src/lazy/expanded/mod.rs +++ b/src/lazy/expanded/mod.rs @@ -57,6 +57,7 @@ use crate::lazy::expanded::r#struct::LazyExpandedStruct; use crate::lazy::expanded::sequence::Environment; use crate::lazy::expanded::template::{TemplateElement, TemplateMacro, TemplateValue}; use crate::lazy::r#struct::LazyStruct; +use crate::lazy::raw_stream_item::{EndPosition, LazyRawStreamItem}; use crate::lazy::raw_value_ref::RawValueRef; use crate::lazy::sequence::{LazyList, LazySExp}; use crate::lazy::str_ref::StrRef; @@ -68,8 +69,8 @@ use crate::lazy::value::LazyValue; use crate::raw_symbol_ref::AsRawSymbolRef; use crate::result::IonFailure; use crate::{ - Catalog, Decimal, HasRange, HasSpan, Int, IonResult, IonType, RawSymbolRef, RawVersionMarker, - Span, SymbolRef, SymbolTable, Timestamp, ValueRef, + Catalog, Decimal, HasRange, HasSpan, Int, IonResult, IonType, RawStreamItem, RawSymbolRef, + RawVersionMarker, Span, SymbolRef, SymbolTable, Timestamp, ValueRef, }; // All of these modules (and most of their types) are currently `pub` as the lazy reader is gated @@ -192,32 +193,6 @@ impl<'top> Deref for EncodingContextRef<'top> { } } -#[derive(Debug)] -/// Stream components emitted by a LazyExpandingReader. These items may be encoded directly in the -/// stream, or may have been produced by the evaluation of an encoding expression (e-expression). -pub enum ExpandedStreamItem<'top, D: Decoder> { - /// An Ion Version Marker (IVM) indicating the Ion major and minor version that were used to - /// encode the values that follow. - VersionMarker(u8, u8), - /// An Ion value whose data has not yet been read. 
For more information about how to read its - /// data and (in the case of containers) access any nested values, see the documentation - /// for [`LazyRawBinaryValue`](crate::lazy::binary::raw::value::LazyRawBinaryValue_1_0). - Value(LazyExpandedValue<'top, D>), - /// The end of the stream - EndOfStream, -} - -impl<'top, D: Decoder> ExpandedStreamItem<'top, D> { - /// Returns an error if this stream item is a version marker or the end of the stream. - /// Otherwise, returns the lazy value it contains. - fn expect_value(&self) -> IonResult<&LazyExpandedValue<'top, D>> { - match self { - ExpandedStreamItem::Value(value) => Ok(value), - _ => IonResult::decoding_error(format!("Expected a value, but found a {:?}", self)), - } - } -} - /// A reader that evaluates macro invocations in the data stream and surfaces the resulting /// raw values to the caller. pub struct ExpandingReader { @@ -509,7 +484,7 @@ impl ExpandingReader { pub fn next_value(&mut self) -> IonResult>> { use SystemStreamItem::*; loop { - match self.next_item() { + match self.next_system_item() { Ok(Value(value)) => return Ok(Some(value)), Ok(EndOfStream(_)) => return Ok(None), Ok(_) => {} @@ -524,9 +499,90 @@ impl ExpandingReader { unsafe { &*self.raw_reader.get() }.encoding() } + /// Returns the next IVM, value, or system value as an `ExpandedStreamItem`. + /// + /// This path is less optimized than `next_system_item` because it needs to surface additional + /// items that do not impact the application. However, it is useful for tooling that needs more + /// visibility into the expansion process. + pub fn next_item(&mut self) -> IonResult> { + // If there's already an active macro evaluator, that means the reader is still in the process + // of expanding a macro invocation it previously encountered. See if it has a value to give us. + if let Some(ptr) = self.evaluator_ptr.get() { + // If there's already an evaluator, dereference the pointer. 
+ let evaluator = Self::ptr_to_evaluator(ptr); + match evaluator.next() { + Ok(Some(value)) => { + if evaluator.is_empty() { + // If the evaluator is empty, unset the pointer so we know not to query it + // further. + self.evaluator_ptr.set(None); + } + return Ok(self.interpret_value(value)?.as_expanded_stream_item()); + } + Ok(None) => {} + Err(e) => return Err(e), + } + } + + // Otherwise, we're now between top level expressions. Take this opportunity to apply any + // pending changes to the encoding context and reset state as needed. + self.between_top_level_expressions(); + + // See if the raw reader can get another expression from the input stream. It's possible + // to find an expression that yields no values (for example: `(:void)`), so we perform this + // step in a loop until we get a value or end-of-stream. + let allocator: &BumpAllocator = self.context().allocator(); + let context_ref = EncodingContextRef::new(allocator.alloc_with(|| self.context())); + // Pull another top-level expression from the input stream if one is available. + use crate::lazy::raw_stream_item::RawStreamItem::*; + let raw_reader = unsafe { &mut *self.raw_reader.get() }; + match raw_reader.next(context_ref)? { + VersionMarker(marker) => { + let _system_item = self.interpret_ivm(marker)?; + Ok(ExpandedStreamItem::VersionMarker(marker)) + } + // We got our value; return it. + Value(raw_value) => { + let value = LazyExpandedValue::from_literal(context_ref, raw_value); + Ok(self.interpret_value(value)?.as_expanded_stream_item()) + } + // It's another macro invocation, we'll add it to the evaluator so it will be evaluated + // on the next call and then we'll return the e-expression itself. 
+ EExp(e_exp) => { + let context = self.context(); + let resolved_e_exp = match e_exp.resolve(context_ref) { + Ok(resolved) => resolved, + Err(e) => return Err(e), + }; + + // Get the current evaluator or make a new one + let evaluator = match self.evaluator_ptr.get() { + // If there's already an evaluator in the bump, it's empty. Overwrite it with our new one. + Some(ptr) => { + let bump_evaluator_ref = Self::ptr_to_evaluator(ptr); + bump_evaluator_ref.push(resolved_e_exp.expand()?); + bump_evaluator_ref + } + // If there's not an evaluator in the bump, make a new one. + None => { + let new_evaluator = MacroEvaluator::for_eexp(resolved_e_exp)?; + context.allocator.alloc_with(|| new_evaluator) + } + }; + + // Save the pointer to the evaluator + self.evaluator_ptr + .set(Some(Self::evaluator_to_ptr(evaluator))); + + Ok(ExpandedStreamItem::EExp(resolved_e_exp)) + } + EndOfStream(end_position) => Ok(ExpandedStreamItem::EndOfStream(end_position)), + } + } + /// Returns the next [`SystemStreamItem`] either by continuing to evaluate a macro invocation /// in progress or by pulling another expression from the input stream. - pub fn next_item(&self) -> IonResult> { + pub fn next_system_item(&self) -> IonResult> { // NB: This method takes an immutable reference to `self` but uses `UnsafeCell` to modify // `self` safely. This allows `next_item` to be used in a loop from next_value without // encountering the borrow checker limitations this method skirts. If/when the borrow @@ -635,7 +691,7 @@ pub enum ExpandedValueSource<'top, D: Decoder> { /// This value was a literal in the input stream. ValueLiteral(D::Value<'top>), /// This value is backed by an e-expression invoking a macro known to produce a single value. - EExp(EExpression<'top, D>), + SingletonEExp(EExpression<'top, D>), /// This value was part of a template definition. Template(Environment<'top, D>, TemplateElement<'top>), /// This value was the computed result of a macro invocation like `(:make_string `...)`. 
@@ -652,16 +708,70 @@ pub enum ExpandedValueSource<'top, D: Decoder> { impl<'top, Encoding: Decoder> Debug for ExpandedValueSource<'top, Encoding> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match &self { - ExpandedValueSource::EExp(eexp) => write!(f, "{eexp:?}"), - ExpandedValueSource::ValueLiteral(v) => write!(f, "{v:?}"), + ExpandedValueSource::SingletonEExp(eexp) => write!(f, "singleton eexp {eexp:?}"), + ExpandedValueSource::ValueLiteral(v) => write!(f, "value literal {v:?}"), ExpandedValueSource::Template(_, template_element) => { - write!(f, "{:?}", template_element.value()) + write!(f, "template {:?}", template_element.value()) } - ExpandedValueSource::Constructed(_, value) => write!(f, "{value:?}"), + ExpandedValueSource::Constructed(_, value) => write!(f, "constructed {value:?}"), } } } +#[derive(Debug, Copy, Clone)] +/// Expanded stream components that a RawReader may encounter. +/// Like `RawStreamItem`, `ExpandedStreamItem` includes both value literals and e-expressions +/// found in the input. +/// Unlike `RawStreamItem`, `ExpandedStreamItem` _also_ includes ephemeral values that were produced +/// by evaluating the e-expressions. Additionally, system values are identified and surfaced as +/// either `SymbolTable`s or `EncodingDirective`s. +pub enum ExpandedStreamItem<'top, D: Decoder> { + /// An Ion Version Marker (IVM) indicating the Ion major and minor version that were used to + /// encode the values that follow. + VersionMarker(D::VersionMarker<'top>), + /// An Ion 1.1+ macro invocation. Ion 1.0 readers will never return a macro invocation. + EExp(EExpression<'top, D>), + /// An Ion application value. + Value(LazyValue<'top, D>), + /// An annotated Ion struct representing a symbol table. + SymbolTable(LazyStruct<'top, D>), + /// An annotated Ion s-expression representing an encoding directive. 
+ EncodingDirective(LazySExp<'top, D>), + /// The end of the stream + EndOfStream(EndPosition), +} + +impl<'top, D: Decoder> ExpandedStreamItem<'top, D> { + /// Returns `true` if this item was produced by evaluating a macro. Otherwise, returns `false`. + pub fn is_ephemeral(&self) -> bool { + use ExpandedStreamItem::*; + match self { + VersionMarker(_) | EExp(_) | EndOfStream(_) => false, + Value(value) => value.expanded().is_ephemeral(), + SymbolTable(symtab) => symtab.as_value().expanded().is_ephemeral(), + EncodingDirective(directive) => directive.as_value().expanded().is_ephemeral(), + } + } + + /// If this stream item is not ephemeral, returns the `LazyRawStreamItem` backing it. + pub fn raw_item(&self) -> Option> { + use ExpandedStreamItem::*; + let raw_item = match self { + VersionMarker(m) => RawStreamItem::VersionMarker(*m), + EExp(eexp) => RawStreamItem::EExp(eexp.raw_invocation), + Value(v) => return v.raw().map(RawStreamItem::Value), + SymbolTable(symbol_table) => { + return symbol_table.as_value().raw().map(RawStreamItem::Value) + } + EncodingDirective(directive) => { + return directive.as_value().raw().map(RawStreamItem::Value) + } + EndOfStream(position) => RawStreamItem::EndOfStream(*position), + }; + Some(raw_item) + } +} + // Converts the raw value literal types associated with each format decoder (e.g. LazyRawTextValue_1_1) // into an ExpandedValueSource. 
impl<'top, V: RawValueLiteral, Encoding: Decoder = V>> From @@ -687,15 +797,15 @@ impl<'top> TemplateVariableReference<'top> { } } - fn name(&self) -> &str { + pub fn name(&self) -> &'top str { self.macro_ref.signature().parameters()[self.signature_index()].name() } - fn host_macro(&self) -> &'top Macro { + pub fn host_macro(&self) -> &'top Macro { self.macro_ref } - fn signature_index(&self) -> usize { + pub fn signature_index(&self) -> usize { self.signature_index as usize } } @@ -729,7 +839,7 @@ impl<'top, Encoding: Decoder> LazyExpandedValue<'top, Encoding> { Some(Self { context: eexp.context, - source: ExpandedValueSource::EExp(eexp), + source: ExpandedValueSource::SingletonEExp(eexp), variable: None, }) } @@ -780,7 +890,7 @@ impl<'top, Encoding: Decoder> LazyExpandedValue<'top, Encoding> { ValueLiteral(value) => value.ion_type(), Template(_, element) => element.value().ion_type(), Constructed(_annotations, value) => value.ion_type(), - EExp(eexp) => eexp.require_expansion_singleton().ion_type(), + SingletonEExp(eexp) => eexp.require_expansion_singleton().ion_type(), } } @@ -792,7 +902,7 @@ impl<'top, Encoding: Decoder> LazyExpandedValue<'top, Encoding> { Constructed(_, value) => { matches!(value, ValueRef::Null(_)) } - EExp(eexp) => eexp.require_expansion_singleton().is_null(), + SingletonEExp(eexp) => eexp.require_expansion_singleton().is_null(), } } @@ -802,7 +912,7 @@ impl<'top, Encoding: Decoder> LazyExpandedValue<'top, Encoding> { ValueLiteral(value) => value.has_annotations(), Template(_, element) => !element.annotations().is_empty(), Constructed(annotations, _) => !annotations.is_empty(), - EExp(eexp) => eexp.require_expansion_singleton().has_annotations(), + SingletonEExp(eexp) => eexp.require_expansion_singleton().has_annotations(), } } @@ -821,17 +931,9 @@ impl<'top, Encoding: Decoder> LazyExpandedValue<'top, Encoding> { annotations.iter(), )) } - EExp(eexp) => { - let annotations_range = 0..eexp.require_expansion_singleton().num_annotations(); - 
let annotations = &eexp - .invoked_macro - .require_template() - .body() - .annotations_storage()[annotations_range]; - ExpandedAnnotationsIterator::new(ExpandedAnnotationsSource::Template( - SymbolsIterator::new(annotations), - )) - } + SingletonEExp(eexp) => ExpandedAnnotationsIterator::new( + ExpandedAnnotationsSource::Template(eexp.require_singleton_annotations()), + ), } } @@ -846,7 +948,7 @@ impl<'top, Encoding: Decoder> LazyExpandedValue<'top, Encoding> { element, )), Constructed(_annotations, value) => Ok((**value).as_expanded()), - EExp(ref eexp) => eexp.expand_to_single_value()?.read(), + SingletonEExp(ref eexp) => eexp.expand_to_single_value()?.read(), } } @@ -859,7 +961,7 @@ impl<'top, Encoding: Decoder> LazyExpandedValue<'top, Encoding> { Ok(ValueRef::from_template(self.context, *environment, element)) } Constructed(_annotations, value) => Ok(**value), - EExp(ref eexp) => self.read_resolved_singleton_eexp(eexp), + SingletonEExp(ref eexp) => self.read_resolved_singleton_eexp(eexp), } } @@ -891,6 +993,20 @@ impl<'top, Encoding: Decoder> LazyExpandedValue<'top, Encoding> { IonResult::decoding_error("expected LazyExpandedValue to be a literal") } + /// Returns `true` if this value was produced by evaluating a macro. Otherwise, returns `false`. + pub fn is_ephemeral(&self) -> bool { + !matches!(&self.source, ExpandedValueSource::ValueLiteral(_)) || self.is_parameter() + } + + /// Returns `true` if this value was an argument passed into a macro. 
+ pub fn is_parameter(&self) -> bool { + self.variable.is_some() + } + + pub fn variable(&self) -> Option> { + self.variable + } + pub fn range(&self) -> Option> { if let ExpandedValueSource::ValueLiteral(value) = &self.source { return Some(value.range()); diff --git a/src/lazy/expanded/template.rs b/src/lazy/expanded/template.rs index 04272525..48f12210 100644 --- a/src/lazy/expanded/template.rs +++ b/src/lazy/expanded/template.rs @@ -962,9 +962,15 @@ impl<'top, D: Decoder> Iterator for TemplateMacroInvocationArgsIterator<'top, D> )) } TemplateBodyExprKind::Variable(variable_ref) => { - self + let mut expr = self .environment - .require_expr(variable_ref.signature_index()) + .require_expr(variable_ref.signature_index()); + // If this is a value (and therefore needs no further evaluation), tag it as having + // come from this variable in the template body. + if let ValueExpr::ValueLiteral(ref mut value) = expr { + *value = value.via_variable(variable_ref.resolve(self.host_template.reference())) + } + expr }, TemplateBodyExprKind::MacroInvocation(body_invocation) => { let invocation = body_invocation diff --git a/src/lazy/raw_stream_item.rs b/src/lazy/raw_stream_item.rs index a55d6a56..8f7bef92 100644 --- a/src/lazy/raw_stream_item.rs +++ b/src/lazy/raw_stream_item.rs @@ -5,9 +5,9 @@ use crate::{AnyEncoding, IonEncoding, IonError, IonResult}; use std::fmt::Debug; use std::ops::Range; -#[derive(Debug)] +#[derive(Debug, Copy, Clone)] /// Raw stream components that a RawReader may encounter. -pub enum RawStreamItem { +pub enum RawStreamItem { /// An Ion Version Marker (IVM) indicating the Ion major and minor version that were used to /// encode the values that follow. 
VersionMarker(M), @@ -38,8 +38,11 @@ impl<'top> LazyRawStreamItem<'top, AnyEncoding> { } } -impl HasRange - for RawStreamItem +impl< + M: Debug + Copy + Clone + HasRange, + V: Debug + Copy + Clone + HasRange, + E: Debug + Copy + Clone + HasRange, + > HasRange for RawStreamItem { fn range(&self) -> Range { use RawStreamItem::*; @@ -52,8 +55,12 @@ impl HasRange } } -impl<'top, M: Debug + HasSpan<'top>, V: Debug + HasSpan<'top>, E: Debug + HasSpan<'top>> - HasSpan<'top> for RawStreamItem +impl< + 'top, + M: Debug + Copy + Clone + HasSpan<'top>, + V: Debug + Copy + Clone + HasSpan<'top>, + E: Debug + Copy + Clone + HasSpan<'top>, + > HasSpan<'top> for RawStreamItem { fn span(&self) -> Span<'top> { use RawStreamItem::*; diff --git a/src/lazy/span.rs b/src/lazy/span.rs index bb5d41fb..42126b51 100644 --- a/src/lazy/span.rs +++ b/src/lazy/span.rs @@ -14,6 +14,18 @@ pub struct Span<'a> { offset: usize, } +impl<'a> AsRef<[u8]> for Span<'a> { + fn as_ref(&self) -> &[u8] { + self.bytes() + } +} + +impl<'a> From> for &'a [u8] { + fn from(value: Span<'a>) -> Self { + value.bytes + } +} + impl<'a, A: AsRef<[u8]>> PartialEq for Span<'a> { fn eq(&self, other: &A) -> bool { self.bytes() == other.as_ref() @@ -41,4 +53,12 @@ impl<'a> Span<'a> { std::str::from_utf8(self.bytes) .map_err(|_| IonError::decoding_error("span text was not valid UTF-8")) } + + pub fn len(&self) -> usize { + self.bytes.len() + } + + pub fn is_empty(&self) -> bool { + self.bytes.is_empty() + } } diff --git a/src/lazy/streaming_raw_reader.rs b/src/lazy/streaming_raw_reader.rs index 4bd632be..67016628 100644 --- a/src/lazy/streaming_raw_reader.rs +++ b/src/lazy/streaming_raw_reader.rs @@ -45,6 +45,34 @@ pub struct StreamingRawReader { const DEFAULT_IO_BUFFER_SIZE: usize = 4 * 1024; +pub struct RawReaderState<'a> { + data: &'a [u8], + offset: usize, + encoding: IonEncoding, +} + +impl<'a> RawReaderState<'a> { + pub fn new(data: &'a [u8], offset: usize, encoding: IonEncoding) -> Self { + Self { + data, + offset, + 
encoding, + } + } + + pub fn data(&self) -> &'a [u8] { + self.data + } + + pub fn offset(&self) -> usize { + self.offset + } + + pub fn encoding(&self) -> IonEncoding { + self.encoding + } +} + impl StreamingRawReader { pub fn new(_encoding: Encoding, input: Input) -> StreamingRawReader { StreamingRawReader { @@ -79,6 +107,21 @@ impl StreamingRawReader { pub fn next<'top>( &'top mut self, context: EncodingContextRef<'top>, + ) -> IonResult> { + self.read_next(context, /*is_peek=*/ false) + } + + pub fn peek_next<'top>( + &'top mut self, + context: EncodingContextRef<'top>, + ) -> IonResult> { + self.read_next(context, /*is_peek=*/ true) + } + + fn read_next<'top>( + &'top mut self, + context: EncodingContextRef<'top>, + is_peek: bool, ) -> IonResult> { let mut input_source_exhausted = false; loop { @@ -174,13 +217,16 @@ impl StreamingRawReader { } } - // Mark those input bytes as having been consumed so they are not read again. - input.consume(bytes_read); - // Update the streaming reader's position to reflect the number of bytes we - // just read. - self.stream_position = end_position; - // If the item read was an IVM, this will be a new value. - self.detected_encoding = new_encoding; + // If this isn't just a peek, update our state to remember what we've already read. + if !is_peek { + // Mark those input bytes as having been consumed so they are not read again. + input.consume(bytes_read); + // Update the streaming reader's position to reflect the number of bytes we + // just read. + self.stream_position = end_position; + // If the item read was an IVM, this will be a new value. 
+ self.detected_encoding = new_encoding; + } } return result; diff --git a/src/lazy/struct.rs b/src/lazy/struct.rs index 21673ffc..d106e18f 100644 --- a/src/lazy/struct.rs +++ b/src/lazy/struct.rs @@ -1,8 +1,5 @@ #![allow(non_camel_case_types)] -use std::fmt; -use std::fmt::{Debug, Formatter}; - use crate::element::builders::StructBuilder; use crate::lazy::decoder::{Decoder, LazyRawContainer}; use crate::lazy::encoding::BinaryEncoding_1_0; @@ -14,6 +11,8 @@ use crate::lazy::value::{AnnotationsIterator, LazyValue}; use crate::lazy::value_ref::ValueRef; use crate::result::IonFailure; use crate::{Annotations, Element, IntoAnnotatedElement, IonError, IonResult, Struct, SymbolRef}; +use std::fmt; +use std::fmt::{Debug, Formatter}; /// An as-of-yet unread binary Ion struct. `LazyStruct` is immutable; its fields and annotations /// can be read any number of times. @@ -299,6 +298,39 @@ impl<'top, D: Decoder> LazyField<'top, D> { expanded_value: self.expanded_field.value(), } } + + #[cfg(feature = "experimental-tooling-apis")] + pub fn raw_name(&self) -> Option> { + if let crate::LazyExpandedFieldName::RawName(_context, raw_name) = + self.expanded_field.name() + { + Some(raw_name) + } else { + None + } + } + + #[cfg(feature = "experimental-tooling-apis")] + pub fn raw_value(&self) -> Option> { + if let crate::ExpandedValueSource::ValueLiteral(literal) = + self.expanded_field.value().source() + { + Some(literal) + } else { + None + } + } + + #[cfg(feature = "experimental-tooling-apis")] + pub fn range(&self) -> Option> { + use crate::HasRange; + match (self.raw_name(), self.raw_value()) { + (Some(raw_name), Some(raw_value)) => { + Some(raw_name.range().start..raw_value.range().end) + } + _ => None, + } + } } pub struct StructIterator<'top, D: Decoder> { diff --git a/src/lazy/system_reader.rs b/src/lazy/system_reader.rs index 69840372..67b060c5 100644 --- a/src/lazy/system_reader.rs +++ b/src/lazy/system_reader.rs @@ -6,7 +6,7 @@ use 
crate::lazy::expanded::compiler::TemplateCompiler; use crate::lazy::expanded::encoding_module::EncodingModule; use crate::lazy::expanded::macro_table::MacroTable; use crate::lazy::expanded::template::TemplateMacro; -use crate::lazy::expanded::{ExpandingReader, LazyExpandedValue}; +use crate::lazy::expanded::{ExpandedStreamItem, ExpandingReader, LazyExpandedValue}; use crate::lazy::sequence::SExpIterator; use crate::lazy::streaming_raw_reader::{IonInput, StreamingRawReader}; use crate::lazy::system_stream_item::SystemStreamItem; @@ -195,12 +195,21 @@ impl SystemReader { self.expanding_reader.pending_context_changes() } - /// Returns the next top-level stream item (IVM, Symbol Table, Value, or Nothing) as a - /// [`SystemStreamItem`]. - pub fn next_item(&mut self) -> IonResult> { + /// Returns the next top-level stream item (IVM, symbol table, encoding directive, Value, or nothing) + /// as an [`ExpandedStreamItem`]. + /// + /// This method exists largely for tooling; most applications will want to + /// use [`next_item`](Self::next_item). + pub fn next_expanded_item(&mut self) -> IonResult> { self.expanding_reader.next_item() } + /// Returns the next top-level stream item (IVM, symbol table, encoding directive, Value, or nothing) + /// as a [`SystemStreamItem`]. + pub fn next_item(&mut self) -> IonResult> { + self.expanding_reader.next_system_item() + } + /// Returns the next value that is part of the application data model, bypassing all encoding /// artifacts (IVMs, symbol tables). pub fn next_value(&mut self) -> IonResult>> { @@ -1077,7 +1086,7 @@ mod tests { &[ Symbol::from("foo"), Symbol::from("bar"), - Symbol::from("baz") + Symbol::from("baz"), ] ); @@ -1087,11 +1096,11 @@ mod tests { // This directive defines two more. 
assert_eq!(new_macro_table.len(), 2 + MacroTable::NUM_SYSTEM_MACROS); assert_eq!( - new_macro_table.macro_with_id(4), + new_macro_table.macro_with_id(MacroTable::FIRST_USER_MACRO_ID), new_macro_table.macro_with_name("seventeen") ); assert_eq!( - new_macro_table.macro_with_id(5), + new_macro_table.macro_with_id(MacroTable::FIRST_USER_MACRO_ID + 1), new_macro_table.macro_with_name("twelve") ); diff --git a/src/lazy/system_stream_item.rs b/src/lazy/system_stream_item.rs index c1fb4587..f42b1078 100644 --- a/src/lazy/system_stream_item.rs +++ b/src/lazy/system_stream_item.rs @@ -5,7 +5,7 @@ use crate::lazy::r#struct::LazyStruct; use crate::lazy::raw_stream_item::{EndPosition, LazyRawStreamItem, RawStreamItem}; use crate::lazy::value::LazyValue; use crate::result::IonFailure; -use crate::{IonError, IonResult, LazySExp}; +use crate::{ExpandedStreamItem, IonError, IonResult, LazySExp}; /// System stream elements that a SystemReader may encounter. #[non_exhaustive] @@ -23,6 +23,20 @@ pub enum SystemStreamItem<'top, D: Decoder> { EndOfStream(EndPosition), } +impl<'top, D: Decoder> SystemStreamItem<'top, D> { + /// Returns an [`ExpandedStreamItem`] view of this item. 
+ pub fn as_expanded_stream_item(&self) -> ExpandedStreamItem<'top, D> { + use SystemStreamItem::*; + match self { + VersionMarker(m) => ExpandedStreamItem::VersionMarker(*m), + SymbolTable(s) => ExpandedStreamItem::SymbolTable(*s), + EncodingDirective(d) => ExpandedStreamItem::EncodingDirective(*d), + Value(v) => ExpandedStreamItem::Value(*v), + EndOfStream(e) => ExpandedStreamItem::EndOfStream(*e), + } + } +} + impl<'top, D: Decoder> Debug for SystemStreamItem<'top, D> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs index ada9c4b7..9a9e0927 100644 --- a/src/lazy/text/buffer.rs +++ b/src/lazy/text/buffer.rs @@ -11,7 +11,8 @@ use nom::bytes::complete::{ }; use nom::bytes::streaming::{is_a, tag, take_until, take_while_m_n}; use nom::character::complete::{ - char as complete_char, digit1 as complete_digit1, one_of as complete_one_of, + char as complete_char, digit0 as complete_digit0, digit1 as complete_digit1, + one_of as complete_one_of, }; use nom::character::streaming::{alphanumeric1, char, digit1, one_of, satisfy}; use nom::combinator::{consumed, eof, map, not, opt, peek, recognize, success, value}; @@ -1088,6 +1089,11 @@ impl<'top> TextBufferView<'top> { // TODO: Support macro ID kinds besides unqualified names let (exp_body_after_id, (macro_id_bytes, matched_symbol)) = consumed(Self::match_identifier)(eexp_body)?; + if exp_body_after_id.is_empty() { + // Unlike a symbol value with identifier syntax, an e-expression identifier cannot be + // the last thing in the stream. 
+ return Err(nom::Err::Incomplete(Needed::Unknown)); + } let id = match matched_symbol .read(self.context.allocator(), macro_id_bytes) @@ -1554,17 +1560,44 @@ impl<'top> TextBufferView<'top> { fn match_dot_followed_by_base_10_digits(self) -> IonMatchResult<'top> { recognize(preceded( complete_tag("."), - opt(Self::match_digits_after_dot), + opt(Self::match_zero_or_more_digits_after_dot), ))(self) } /// Like `match_digits_before_dot`, but allows leading zeros. - fn match_digits_after_dot(self) -> IonMatchResult<'top> { + fn match_one_or_more_digits_after_dot(self) -> IonMatchResult<'top> { recognize(terminated( - // Zero or more digits-followed-by-underscores + // Any number of digit-sequence-with-trailing-underscores... many0_count(pair(complete_digit1, complete_char('_'))), - // One or more digits - complete_digit1, + // ...and at least one trailing digit. Inputs that don't have any underscores + // will be handled by this parser branch. + pair(satisfy(|c| c.is_ascii_digit()), complete_digit0), + // Note: ^-- We use this `pair(satisfy(...), complete_digit0)` to guarantee a subtle + // behavior. At the end of the buffer, an empty input to this parser must be + // considered 'incomplete' instead of 'invalid'. In contrast, an input of a single + // digit would be considered complete even though the buffer could get more data later. + // (If the buffer gets more data, it's the StreamingRawReader's responsibility to + // discard the `1.1` and try again.) + ))(self) + } + + /// Like `match_digits_before_dot`, but allows leading zeros. + fn match_zero_or_more_digits_after_dot(self) -> IonMatchResult<'top> { + recognize(terminated( + // Zero or more digits-followed-by-underscores. + many0_count(pair( + complete_digit1, + terminated( + // The digit sequence can be followed by an underscore... + complete_char('_'), + // ...as long as the character after the underscore is another digit. 
+ peek(satisfy(|c| c.is_ascii_digit())), + ), + )), + // ...and zero or more trailing digits. This parser branch handles: + // * inputs that don't have any underscores + // * empty inputs + complete_digit0, ))(self) } @@ -1573,7 +1606,7 @@ impl<'top> TextBufferView<'top> { fn match_float_exponent_marker_and_digits(self) -> IonMatchResult<'top> { preceded( complete_one_of("eE"), - recognize(Self::match_exponent_sign_and_digits), + recognize(Self::match_exponent_sign_and_one_or_more_digits), )(self) } @@ -1586,23 +1619,26 @@ impl<'top> TextBufferView<'top> { /// /// Returns a boolean indicating whether the sign was negative (vs absent or positive) /// and the buffer slice containing the digits. - fn match_exponent_sign_and_digits(self) -> IonParseResult<'top, (bool, Self)> { + fn match_exponent_sign_and_one_or_more_digits(self) -> IonParseResult<'top, (bool, Self)> { pair( // Optional leading sign; if there's no sign, it's not negative. opt(Self::match_any_sign).map(|s| s == Some('-')), - Self::match_digits_after_dot, + Self::match_one_or_more_digits_after_dot, )(self) } /// Matches `-` OR `+`. /// /// This is used for matching exponent signs; most places in Ion do not allow `+`. 
- pub fn match_any_sign(self) -> IonParseResult<'top, char> { + pub fn match_any_sign(self) -> IonParseResult<'top, std::primitive::char> { complete_one_of("-+")(self) } pub fn match_decimal_exponent(self) -> IonParseResult<'top, (bool, TextBufferView<'top>)> { - preceded(complete_one_of("dD"), Self::match_exponent_sign_and_digits)(self) + preceded( + complete_one_of("dD"), + Self::match_exponent_sign_and_one_or_more_digits, + )(self) } /// Match an optional sign (if present), digits before the decimal point, then digits after the @@ -1613,10 +1649,9 @@ impl<'top> TextBufferView<'top> { opt(complete_tag("-")), Self::match_digits_before_dot, alt(( - // Either a decimal point and digits and optional d/D and exponent tuple(( complete_tag("."), - opt(Self::match_digits_after_dot), + opt(Self::match_zero_or_more_digits_after_dot), opt(Self::match_decimal_exponent), )) .map(|(dot, maybe_digits_after_dot, maybe_exponent)| { @@ -1956,7 +1991,7 @@ impl<'top> TextBufferView<'top> { } /// Matches a single base-10 digit, 0-9. 
- fn match_any_digit(self) -> IonParseResult<'top, char> { + fn match_any_digit(self) -> IonParseResult<'top, std::primitive::char> { satisfy(|c| c.is_ascii_digit())(self) } @@ -2747,13 +2782,15 @@ mod tests { ], expect_mismatch: [ "305", // Integer - "305e", // Has exponent delimiter but no exponent ".305e", // No digits before the decimal point "305e0.5", // Fractional exponent "305e-0.5", // Negative fractional exponent "0305e1", // Leading zero "+305e1", // Leading plus sign "--305e1", // Multiple negative signs + ], + expect_incomplete: [ + "305e", // Has exponent delimiter but no exponent ] } @@ -2878,8 +2915,17 @@ mod tests { "5.0d+1", "-5.0d-1", ], expect_mismatch: [ - "123._456", "5", "5d", "05d", "-5d", "5.d", "-5.d", "5.D", "-5.D", "5.1d", "-5.1d", - "5.1D", "-5.1D", "-5.0+0", + "123._456", "5", "05d", "-5.0+0", + ], + expect_incomplete: [ + "5d", + "-5d", + "5.d", + "-5.d", + "5.D", + "-5.D", + "5.1d", "-5.1d", + "5.1D", "-5.1D", ] } diff --git a/src/lazy/text/parse_result.rs b/src/lazy/text/parse_result.rs index 0d13479c..c67de352 100644 --- a/src/lazy/text/parse_result.rs +++ b/src/lazy/text/parse_result.rs @@ -129,21 +129,57 @@ impl<'data> From> for IonError { .unwrap_or("invalid Ion syntax encountered"), ); if let Some(label) = invalid_input_error.label { - message.push_str(" while "); + message.push_str("\n while "); message.push_str(label.as_ref()); } - message.push_str("; buffer: "); + use std::fmt::Write; let input = invalid_input_error.input; - let buffer_text = if let Ok(text) = invalid_input_error.input.as_text() { - text.chars().take(32).collect::() - } else { - format!( - "{:X?}", - &invalid_input_error.input.bytes()[..(32.min(input.len()))] - ) + + // Make displayable strings showing the contents of the first and last 32 characters + // of the buffer. If the buffer is smaller than 32 bytes, fewer characters will be shown. 
+ const NUM_CHARS_TO_SHOW: usize = 32; + let (buffer_head, buffer_tail) = match input.as_text() { + // The buffer contains UTF-8 bytes, so we'll display it as text + Ok(text) => { + let head = text.chars().take(NUM_CHARS_TO_SHOW).collect::(); + let tail_backwards = text + .chars() + .rev() + .take(NUM_CHARS_TO_SHOW) + .collect::>(); + let tail = tail_backwards.iter().rev().collect::(); + (head, tail) + } + // The buffer contains non-text bytes, so we'll show its contents as formatted hex + // pairs instead. + Err(_) => { + let head = format!( + "{:X?}", + &invalid_input_error.input.bytes()[..NUM_CHARS_TO_SHOW.min(input.len())] + ); + let tail_bytes_to_take = input.bytes().len().min(NUM_CHARS_TO_SHOW); + let buffer_tail = &input.bytes()[input.len() - tail_bytes_to_take..]; + let tail = format!("{:X?}", buffer_tail); + (head, tail) + } }; - message.push_str(buffer_text.as_str()); - message.push_str("..."); + // The offset and buffer head will often be helpful to the programmer. The buffer tail + // and buffer length will be helpful to library maintainers receiving copy/pasted + // error messages to debug. 
+ write!( + message, + r#" + offset={} + buffer head=<{}...> + buffer tail=<...{}> + buffer len={} + "#, + invalid_input_error.input.offset(), + buffer_head, + buffer_tail, + input.len(), + ) + .unwrap(); let position = Position::with_offset(invalid_input_error.input.offset()) .with_length(invalid_input_error.input.len()); let decoding_error = DecodingError::new(message).with_position(position); diff --git a/src/lazy/text/raw/reader.rs b/src/lazy/text/raw/reader.rs index a7430b88..29bd6a9d 100644 --- a/src/lazy/text/raw/reader.rs +++ b/src/lazy/text/raw/reader.rs @@ -5,6 +5,7 @@ use crate::lazy::decoder::LazyRawReader; use crate::lazy::encoding::TextEncoding_1_0; use crate::lazy::expanded::EncodingContextRef; use crate::lazy::raw_stream_item::{EndPosition, LazyRawStreamItem, RawStreamItem}; +use crate::lazy::streaming_raw_reader::RawReaderState; use crate::lazy::text::buffer::TextBufferView; use crate::lazy::text::parse_result::AddContext; use crate::{Encoding, IonResult}; @@ -92,8 +93,8 @@ impl<'data> LazyRawReader<'data, TextEncoding_1_0> for LazyRawTextReader_1_0<'da LazyRawTextReader_1_0::new_with_offset(data, offset) } - fn stream_data(&self) -> (&'data [u8], usize, IonEncoding) { - ( + fn save_state(&self) -> RawReaderState<'data> { + RawReaderState::new( &self.input[self.local_offset..], self.position(), self.encoding(), diff --git a/src/lazy/text/raw/v1_1/reader.rs b/src/lazy/text/raw/v1_1/reader.rs index 0158ba15..cbe048c4 100644 --- a/src/lazy/text/raw/v1_1/reader.rs +++ b/src/lazy/text/raw/v1_1/reader.rs @@ -18,6 +18,7 @@ use crate::lazy::expanded::macro_evaluator::RawEExpression; use crate::lazy::expanded::EncodingContextRef; use crate::lazy::raw_stream_item::{EndPosition, LazyRawStreamItem, RawStreamItem}; use crate::lazy::span::Span; +use crate::lazy::streaming_raw_reader::RawReaderState; use crate::lazy::text::buffer::TextBufferView; use crate::lazy::text::matched::{MatchedFieldName, MatchedValue}; use crate::lazy::text::parse_result::{AddContext, 
ToIteratorOutput}; @@ -50,8 +51,8 @@ impl<'data> LazyRawReader<'data, TextEncoding_1_1> for LazyRawTextReader_1_1<'da } } - fn stream_data(&self) -> (&'data [u8], usize, IonEncoding) { - ( + fn save_state(&self) -> RawReaderState<'data> { + RawReaderState::new( &self.input[self.local_offset..], self.position(), self.encoding(), diff --git a/src/lazy/value.rs b/src/lazy/value.rs index 1b44e10f..055fffe4 100644 --- a/src/lazy/value.rs +++ b/src/lazy/value.rs @@ -52,7 +52,7 @@ use crate::{ ///# #[cfg(not(feature = "experimental-reader-writer"))] ///# fn main() -> IonResult<()> { Ok(()) } /// ``` -#[derive(Copy, Clone)] +#[derive(Debug, Copy, Clone)] pub struct LazyValue<'top, D: Decoder> { pub(crate) expanded_value: LazyExpandedValue<'top, D>, } diff --git a/src/lib.rs b/src/lib.rs index bf56eef4..1d408d09 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -288,7 +288,7 @@ macro_rules! v1_x_tooling_apis { }, lazy::expanded::e_expression::{EExpression, EExpressionArgsIterator}, lazy::expanded::sequence::{Environment, ExpandedListSource, ExpandedSExpSource, LazyExpandedList, LazyExpandedSExp}, - lazy::expanded::{LazyExpandedValue, ExpandingReader, ExpandedValueSource, ExpandedAnnotationsSource, ExpandedValueRef}, + lazy::expanded::{ExpandedStreamItem, LazyExpandedValue, ExpandingReader, ExpandedValueSource, ExpandedAnnotationsSource, ExpandedValueRef}, lazy::system_stream_item::SystemStreamItem, lazy::system_reader::{SystemReader}, }; @@ -313,6 +313,7 @@ macro_rules! v1_0_tooling_apis { }, lazy::binary::raw::r#struct::{LazyRawBinaryStruct_1_0 as LazyRawBinaryStruct, LazyRawBinaryFieldName_1_0 as LazyRawBinaryFieldName}, lazy::binary::raw::value::{ + EncodedBinaryValue, LazyRawBinaryValue_1_0 as LazyRawBinaryValue, LazyRawBinaryVersionMarker_1_0 as LazyRawBinaryVersionMarker, EncodedBinaryValueData_1_0 as EncodedBinaryValueData, @@ -330,6 +331,15 @@ macro_rules! 
v1_1_tooling_apis { lazy::encoder::binary::v1_1::flex_uint::FlexUInt, lazy::encoder::binary::v1_1::writer::LazyRawBinaryWriter_1_1 as RawBinaryWriter, lazy::encoder::text::v1_1::writer::LazyRawTextWriter_1_1 as RawTextWriter, + lazy::binary::raw::v1_1::sequence::{ + LazyRawBinaryList_1_1 as LazyRawBinaryList, + LazyRawBinarySExp_1_1 as LazyRawBinarySExp + }, + lazy::binary::raw::v1_1::r#struct::{LazyRawBinaryStruct_1_1 as LazyRawBinaryStruct, LazyRawBinaryFieldName_1_1 as LazyRawBinaryFieldName}, + lazy::binary::raw::v1_1::value::{ + LazyRawBinaryValue_1_1 as LazyRawBinaryValue, + LazyRawBinaryVersionMarker_1_1 as LazyRawBinaryVersionMarker, + }, }; }; }