diff --git a/examples/print-content.rs b/examples/print-content.rs index 09d3bd6..c20021a 100644 --- a/examples/print-content.rs +++ b/examples/print-content.rs @@ -2,13 +2,88 @@ extern crate dbase; fn main() { let dbf_path = std::env::args().nth(1).expect("Path to file as first arg"); - let mut reader = dbase::Reader::from_path(dbf_path).unwrap(); + let mut reader = dbase::Reader::from_path(&dbf_path).unwrap(); + println!("{}", reader.header().size_of_record); + // let mut records = reader.iter_records().collect::,_>>().unwrap(); + // + // + // let r = records.clone(); + // + // while records.len() < 500_000 { + // let mut tmp = r.clone(); + // records.append(&mut tmp); + // } + // + // let mut writer = dbase::TableWriterBuilder::from_reader(reader).build_with_file_dest("lol.dbf").unwrap(); + // writer.write_records(&records).unwrap(); - for (i, record_result) in reader.iter_records().enumerate() { - println!("Record {}", i); - let record = record_result.unwrap(); - for (name, value) in record { - println!("\tname: {}, value: {:?}", name, value); + let t = std::time::Instant::now(); + let mut records = Vec::with_capacity(reader.header().num_records as usize); + for record in reader.iter_records() { + // let tt = std::time::Instant::now(); + records.push(record.unwrap()); + // println!("time to read one record: {:?}", tt.elapsed()); + } + println!("Time to read via reader: {:?}", t.elapsed()); + + let mut file = dbase::File::open_read_only(&dbf_path).unwrap(); + let t = std::time::Instant::now(); + let mut records = Vec::with_capacity(file.num_records() * file.fields().len()); + let num_fields = file.fields().len(); + let num_records = file.num_records(); + let mut iter = file.records(); + // let mut c = 0u64; + loop { + // let tt = std::time::Instant::now(); + let Some(mut record) = iter.next() else { + break; + }; + // println!("time to read one record: {:?}", tt.elapsed()); + + for i in 0..2 { + records.push(record.field(dbase::FieldIndex(i)).unwrap().read()); } } + println!("Time to read via file: {:?}", t.elapsed()); + + let mut file = dbase::File::open_read_only(dbf_path).unwrap(); + let t = std::time::Instant::now(); + let mut records = Vec::with_capacity(file.num_records()); + let mut iter = file.records(); + while let Some(mut record) = iter.next() { + records.push(record.read().unwrap()); + } + println!("Time to read via file: {:?}", t.elapsed()); + + // for (i, record_result) in reader.iter_records().enumerate() { + // println!("Record {}", i); + // let record = record_result.unwrap(); + // for (name, value) in record { + // println!("\tname: {}, value: {:?}", name, value); + // } + // } } + +// +// use std::fs::File; +// use std::io::{BufReader, SeekFrom}; +// use std::io::prelude::*; +// use byteorder::WriteBytesExt; +// +// fn main() -> std::io::Result<()> { +// let mut file = File::options().read(true).write(true).truncate(false).open("foo.txt")?; +// let mut file_copy = BufReader::new(file.try_clone()?); +// +// let mut contents = vec![0u8; 3]; +// file_copy.read_exact(&mut contents)?; +// println!("contents: {:?}", contents); +// +// // file.seek(SeekFrom::Start(2)).unwrap(); +// file.write_u8(33).unwrap(); +// +// // file_copy.seek(SeekFrom::Start(0)).unwrap(); +// file_copy.read_exact(&mut contents).unwrap(); +// println!("contents: {:?}", contents); +// +// Ok(()) +// } diff --git a/src/field/mod.rs b/src/field/mod.rs index f0ce300..41830b8 100644 --- a/src/field/mod.rs +++ b/src/field/mod.rs @@ -1,5 +1,7 @@ use std::convert::TryFrom; use std::io::{Read, Write}; +use std::ops::Index; +use std::slice::SliceIndex; use byteorder::{ReadBytesExt, WriteBytesExt}; @@ -155,6 +157,67 @@ impl FieldInfo { } } +pub struct FieldsInfo { + pub(crate) inner: Vec, +} + +impl FieldsInfo { + pub(crate) fn read_from(source: &mut R, num_fields: usize) -> Result { + let mut fields_info = Vec::::with_capacity(num_fields as usize); + for _ in 0..num_fields { + let info = FieldInfo::read_from(source)?; + fields_info.push(info); + } + + Ok(Self { inner: fields_info }) + } + + // TODO FieldIndex ? + pub(crate) fn field_position_in_record(&self, index: usize) -> Option { + self.inner + .get(..index) + .map(|slc| slc.iter().map(|i| i.field_length as usize).sum::()) + } + + pub(crate) fn size_of_all_fields(&self) -> usize { + self.inner + .iter() + .map(|i| i.field_length as usize) + .sum::() + } + + pub(crate) fn at_least_one_field_is_memo(&self) -> bool { + self.inner + .iter() + .any(|f_info| f_info.field_type == FieldType::Memo) + } + + pub fn len(&self) -> usize { + self.inner.len() + } + + pub fn iter(&self) -> std::slice::Iter<'_, FieldInfo> { + self.inner.iter() + } +} + +impl AsRef<[FieldInfo]> for FieldsInfo { + fn as_ref(&self) -> &[FieldInfo] { + &self.inner + } +} + +impl Index for FieldsInfo +where + I: SliceIndex<[FieldInfo]>, +{ + type Output = I::Output; + + fn index(&self, index: I) -> &Self::Output { + &self.inner.as_slice()[index] + } +} + impl std::fmt::Display for FieldInfo { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( @@ -172,21 +235,28 @@ pub(crate) enum DeletionFlag { } impl DeletionFlag { - pub(crate) fn read_from(source: &mut T) -> std::io::Result { - let byte = source.read_u8()?; + pub(crate) const fn to_byte(self) -> u8 { + match self { + Self::NotDeleted => 0x20, + Self::Deleted => 0x2A, + } + } + + pub(crate) const fn from_byte(byte: u8) -> Self { match byte { - 0x20 => Ok(Self::NotDeleted), - 0x2A => Ok(Self::Deleted), + 0x20 => Self::NotDeleted, + 0x2A => Self::Deleted, // Silently consider other values as not deleted - _ => Ok(Self::NotDeleted), + _ => Self::NotDeleted, } } + pub(crate) fn read_from(source: &mut T) -> std::io::Result { + source.read_u8().map(Self::from_byte) + } + pub(crate) fn write_to(self, dst: &mut T) -> std::io::Result<()> { - match self { - Self::NotDeleted => dst.write_u8(0x20), - Self::Deleted => dst.write_u8(0x2A), - } + dst.write_u8(self.to_byte()) } } /// Flags describing a field diff --git a/src/file.rs b/src/file.rs index 30444c1..7994e58 100644 --- a/src/file.rs +++ b/src/file.rs @@ -1,7 +1,8 @@ use crate::encoding::DynEncoding; use crate::field::types::TrimOption; -use crate::field::{DeletionFlag, DELETION_FLAG_SIZE}; +use crate::field::{DeletionFlag, FieldsInfo, DELETION_FLAG_SIZE}; use crate::header::Header; +use crate::memo::MemoReader; use crate::reading::{ReadingOptions, BACKLINK_SIZE, TERMINATOR_VALUE}; use crate::writing::{write_header_parts, WritableAsDbaseField}; use crate::ErrorKind::UnsupportedCodePage; @@ -85,12 +86,20 @@ impl<'a, T> FieldRef<'a, T> { .header .record_position(self.record_index.0) .unwrap() as u64; - let position_in_record = self.file.fields_info[..self.field_index.0] - .iter() - .map(|i| i.field_length as u64) - .sum::(); + let position_in_record = self.position_in_record(); + + record_position + position_in_record as u64 + } - record_position + position_in_record + /// Returns the start position in the record **INCLUDING** + /// the deletion flag + fn position_in_record(&self) -> usize { + DELETION_FLAG_SIZE + + self + .file + .fields_info + .field_position_in_record(self.field_index.0) + .expect("internal error, invalid field index in FieldRef") } } @@ -98,7 +107,7 @@ impl<'a, T> FieldRef<'a, T> where T: Seek, { - pub(crate) fn seek_to_beginning(&mut self) -> Result { + fn seek_to_beginning(&mut self) -> Result { let field_info = &self.file.fields_info[self.field_index.0]; self.file @@ -114,22 +123,15 @@ where { /// Reads and returns the value pub fn read(&mut self) -> Result { - self.seek_to_beginning() - .map_err(|e| Error::new(e, self.record_index.0))?; - let field_info = &self.file.fields_info[self.field_index.0]; - let buffer = &mut self.file.field_data_buffer[..field_info.field_length as usize]; - self.file.inner.read(buffer).map_err(|e| { - Error::new( - FieldIOError::new(ErrorKind::IoError(e), Some(field_info.clone())), - self.record_index.0, - ) - })?; + let start_pos = self.position_in_record(); + let field_bytes = &mut self.file.record_data_buffer.get_mut() + [start_pos..start_pos + field_info.field_length as usize]; - FieldValue::read_from::>, _>( - &buffer, - &mut None, + FieldValue::read_from( + &field_bytes, + &mut self.file.memo_reader, field_info, &self.file.encoding, TrimOption::BeginEnd, @@ -170,14 +172,21 @@ where where ValueType: WritableAsDbaseField, { - self.seek_to_beginning() + self.file.file_position = self + .seek_to_beginning() .map_err(|e| Error::new(e, self.record_index.0))?; let field_info = &self.file.fields_info[self.field_index.0]; - let buffer = &mut self.file.field_data_buffer[..field_info.field_length as usize]; - buffer.fill(0); - let mut cursor = Cursor::new(buffer); + let start_pos = self.position_in_record(); + let field_bytes = &mut self.file.record_data_buffer.get_mut() + [start_pos..start_pos + field_info.field_length as usize]; + field_bytes.fill(0); + + // Note that since we modify the internal buffer, we don't need to re-read the + // record / buffer, meaning if a user writes then reads it should get correct + // value, and we did not re-read from file. + let mut cursor = Cursor::new(field_bytes); value .write_as(field_info, &self.file.encoding, &mut cursor) .map_err(|e| { @@ -188,6 +197,7 @@ where })?; let buffer = cursor.into_inner(); + self.file.inner.write_all(&buffer).map_err(|e| { Error::new( FieldIOError::new(ErrorKind::IoError(e), Some(field_info.clone())), @@ -195,6 +205,8 @@ where ) })?; + self.file.file_position += buffer.len() as u64; + Ok(()) } } @@ -240,17 +252,12 @@ impl<'a, T> RecordRef<'a, T> where T: Seek, { - pub fn seek_to_beginning(&mut self) -> Result { - self.file - .inner - .seek(SeekFrom::Start(self.position_in_source())) - .map_err(|e| FieldIOError::new(ErrorKind::IoError(e), None)) - } - - pub fn seek_before_deletion_flag(&mut self) -> Result { + fn seek_before_deletion_flag(&mut self) -> Result { self.file .inner - .seek(SeekFrom::Start(self.position_in_source() - 1)) + .seek(SeekFrom::Start( + self.position_in_source() - DELETION_FLAG_SIZE as u64, + )) .map_err(|e| FieldIOError::new(ErrorKind::IoError(e), None)) } } @@ -263,34 +270,29 @@ where /// /// - true -> the record is marked as deleted /// - false -> the record is **not** marked as deleted - pub fn is_deleted(&mut self) -> Result { - let deletion_flag_pos = self.position_in_source() - DELETION_FLAG_SIZE as u64; - self.file - .inner - .seek(SeekFrom::Start(deletion_flag_pos)) - .map_err(|error| Error::io_error(error, self.index.0))?; - - let deletion_flag = DeletionFlag::read_from(&mut self.file.inner) - .map_err(|error| Error::io_error(error, self.index.0))?; + pub fn is_deleted(&self) -> Result { + let deletion_flag = DeletionFlag::from_byte(self.file.record_data_buffer.get_ref()[0]); Ok(deletion_flag == DeletionFlag::Deleted) } + /// Reads the record pub fn read(&mut self) -> Result { self.read_as() } + /// Reads the record as the given type pub fn read_as(&mut self) -> Result where R: ReadableRecord, { - self.seek_to_beginning() - .map_err(|error| Error::new(error, self.index.0))?; - - let mut field_iterator = FieldIterator::<_, Cursor>> { - source: &mut self.file.inner, + self.file + .record_data_buffer + .set_position(DELETION_FLAG_SIZE as u64); + let mut field_iterator = FieldIterator { + source: &mut self.file.record_data_buffer, fields_info: self.file.fields_info.iter().peekable(), - memo_reader: &mut None, + memo_reader: &mut self.file.memo_reader, field_data_buffer: &mut self.file.field_data_buffer, encoding: &self.file.encoding, options: self.file.options, @@ -310,23 +312,36 @@ where where R: WritableRecord, { - self.seek_before_deletion_flag() - .map_err(|error| Error::new(error, self.index.0))?; + self.file.record_data_buffer.get_mut().fill(0); + self.file.record_data_buffer.get_mut()[0] = DeletionFlag::NotDeleted.to_byte(); + self.file.record_data_buffer.set_position(1); let mut field_writer = FieldWriter { - dst: &mut self.file.inner, + dst: &mut self.file.record_data_buffer, fields_info: self.file.fields_info.iter().peekable(), field_buffer: &mut Cursor::new(&mut self.file.field_data_buffer), encoding: &self.file.encoding, }; - field_writer - .write_deletion_flag() - .map_err(|error| Error::io_error(error, self.index.0))?; - record .write_using(&mut field_writer) - .map_err(|error| Error::new(error, self.index.0)) + .map_err(|error| Error::new(error, self.index.0))?; + + self.seek_before_deletion_flag() + .map_err(|error| Error::new(error, self.index.0))?; + + self.file + .inner + .write_all(self.file.record_data_buffer.get_ref()) + .map_err(|error| Error::io_error(error, self.index.0))?; + + // We don't need to update the file's inner position as we re-wrote the whole record + debug_assert_eq!( + self.file.file_position, + self.file.inner.seek(SeekFrom::Current(0)).unwrap() + ); + + Ok(()) } } @@ -336,20 +351,18 @@ pub struct FileRecordIterator<'a, T> { current_record: RecordIndex, } -impl<'a, T> FileRecordIterator<'a, T> { +impl<'a, T> FileRecordIterator<'a, T> +where + T: Seek + Read, +{ // To implement iterator we need the Iterator trait to make use of GATs // which is not the case, to iteration will have to use the while let Some() pattern pub fn next<'s>(&'s mut self) -> Option> { - if self.current_record.0 >= self.file.header.num_records as usize { - None - } else { - let r = RecordRef { - file: &mut self.file, - index: self.current_record, - }; - self.current_record.0 += 1; - Some(r) + let record_ref = self.file.record(self.current_record.0); + if let Some(_) = record_ref { + self.current_record.0 += 1 } + record_ref } } @@ -394,22 +407,30 @@ impl<'a, T> FileRecordIterator<'a, T> { /// ``` pub struct File { pub(crate) inner: T, + memo_reader: Option>, pub(crate) header: Header, - pub(crate) fields_info: Vec, + pub(crate) fields_info: FieldsInfo, pub(crate) encoding: DynEncoding, + /// Buffer that contains a whole record worth of data + /// It also contains the deletion flag + record_data_buffer: Cursor>, /// Non-Memo field length is stored on a u8, /// so fields cannot exceed 255 bytes field_data_buffer: [u8; 255], pub(crate) options: ReadingOptions, + /// We track the position in the file + /// to avoid calling `seek` when we are reading buffer + /// in order (0, 1, 2, etc) + file_position: u64, } impl File { /// Returns the information about fields present in the records pub fn fields(&self) -> &[FieldInfo] { - self.fields_info.as_slice() + self.fields_info.as_ref() } - /// Returns the field infex that corresponds to the given name + /// Returns the field index that corresponds to the given name pub fn field_index(&self, name: &str) -> Option { self.fields_info .iter() @@ -428,7 +449,7 @@ impl File { } impl File { - /// creates of File using source as the storate space. + /// creates of File using source as the storage space. pub fn open(mut source: T) -> Result { let header = Header::read_from(&mut source).map_err(|error| Error::io_error(error, 0))?; @@ -443,15 +464,12 @@ impl File { let num_fields = (offset as usize - Header::SIZE - std::mem::size_of::()) / FieldInfo::SIZE; - let mut fields_info = Vec::::with_capacity(num_fields as usize); - for _ in 0..num_fields { - let info = FieldInfo::read_from(&mut source).map_err(|error| Error { + let fields_info = + FieldsInfo::read_from(&mut source, num_fields).map_err(|error| Error { record_num: 0, field: None, kind: error, })?; - fields_info.push(info); - } let terminator = source .read_u8() @@ -467,14 +485,21 @@ impl File { let field_error = FieldIOError::new(UnsupportedCodePage(header.code_page_mark), None); Error::new(field_error, 0) })?; + + let record_size: usize = DELETION_FLAG_SIZE + fields_info.size_of_all_fields(); + let record_data_buffer = Cursor::new(vec![0u8; record_size]); + // debug_assert_eq!(record_size - DELETION_FLAG_SIZE, header.size_of_record as usize); + Ok(Self { inner: source, - // memo_reader: None, + memo_reader: None, header, fields_info, encoding, + record_data_buffer, field_data_buffer: [0u8; 255], options: ReadingOptions::default(), + file_position: header.offset_to_first_record as u64, }) } @@ -485,10 +510,30 @@ impl File { if index >= self.header.num_records as usize { None } else { - Some(RecordRef { + let record_ref = RecordRef { file: self, index: RecordIndex(index), - }) + }; + let pos = record_ref.position_in_source() - DELETION_FLAG_SIZE as u64; + drop(record_ref); + + if pos != self.file_position { + self.file_position = self + .inner + .seek(SeekFrom::Start(pos)) + .map_err(|e| FieldIOError::new(ErrorKind::IoError(e), None)) + .unwrap() + } + + self.inner + .read_exact(self.record_data_buffer.get_mut()) + .unwrap(); + self.file_position += self.record_data_buffer.get_mut().len() as u64; + let record_ref = RecordRef { + file: self, + index: RecordIndex(index), + }; + Some(record_ref) } } @@ -506,14 +551,27 @@ impl File { impl File { pub fn create_new(mut dst: T, table_info: TableInfo) -> Result { write_header_parts(&mut dst, &table_info.header, &table_info.fields_info)?; - + let record_size: usize = DELETION_FLAG_SIZE + + table_info + .fields_info + .iter() + .map(|i| i.field_length as usize) + .sum::(); + let record_data_buffer = Cursor::new(vec![0u8; record_size]); + let file_position = table_info.header.offset_to_first_record as u64; + debug_assert_eq!(file_position, dst.stream_position().unwrap()); Ok(Self { inner: dst, + memo_reader: None, header: table_info.header, - fields_info: table_info.fields_info, + fields_info: FieldsInfo { + inner: table_info.fields_info, + }, encoding: table_info.encoding, + record_data_buffer, field_data_buffer: [0u8; 255], options: ReadingOptions::default(), + file_position, }) } @@ -593,9 +651,31 @@ impl File { /// Opens an existing dBase file in read only mode pub fn open_read_only>(path: P) -> Result { - let file = std::fs::File::open(path).map_err(|error| Error::io_error(error, 0))?; - - File::open(BufReadWriteFile::new(file).unwrap()) + let file = std::fs::File::open(path.as_ref()).map_err(|error| Error::io_error(error, 0))?; + + let mut file = File::open(BufReadWriteFile::new(file).unwrap())?; + if file.fields_info.at_least_one_field_is_memo() { + let p = path.as_ref(); + let memo_type = file.header.file_type.supported_memo_type(); + if let Some(mt) = memo_type { + let memo_path = p.with_extension(mt.extension()); + + let memo_file = std::fs::File::open(memo_path).map_err(|error| Error { + record_num: 0, + field: None, + kind: ErrorKind::ErrorOpeningMemoFile(error), + })?; + + let memo_reader = + BufReadWriteFile::new(memo_file) + .and_then(|memo_file| { + MemoReader::new(mt, memo_file) + }).map_err(|error| Error::io_error(error, 0))?; + + file.memo_reader = Some(memo_reader); + } + } + Ok(file) } /// Opens an existing dBase file in write only mode diff --git a/src/lib.rs b/src/lib.rs index 802dd75..4f2b41a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -288,7 +288,7 @@ mod reading; mod record; mod writing; -pub use file::{FieldRef, File, RecordRef}; +pub use file::{FieldIndex, FieldRef, File, RecordIndex, RecordRef}; #[cfg(feature = "datafusion")] pub use crate::datafusion::{DbaseTable, DbaseTableFactory}; diff --git a/src/memo.rs b/src/memo.rs index 82fc240..39a156f 100644 --- a/src/memo.rs +++ b/src/memo.rs @@ -9,6 +9,15 @@ pub(crate) enum MemoFileType { FoxBaseMemo, } +impl MemoFileType { + pub(crate) const fn extension(self) -> &'static str { + match self { + MemoFileType::DbaseMemo | MemoFileType::DbaseMemo4 => "dbt", + MemoFileType::FoxBaseMemo => "fpt", + } + } +} + /// Although there are different memo file type with each a different /// header organisation, we use the same struct internally #[derive(Debug, Copy, Clone)] @@ -45,7 +54,7 @@ impl MemoHeader { /// Struct that reads knows how to read data from a memo source #[derive(Debug, Clone)] -pub(crate) struct MemoReader { +pub(crate) struct MemoReader { memo_file_type: MemoFileType, header: MemoHeader, source: T, diff --git a/src/reading.rs b/src/reading.rs index 70e75f3..f9876f0 100644 --- a/src/reading.rs +++ b/src/reading.rs @@ -152,7 +152,7 @@ impl ReaderBuilder { source: file.inner, memo_reader, header: file.header, - fields_info: file.fields_info, + fields_info: file.fields_info.inner, encoding: self .encoding .map_or_else(|| file.encoding, DynEncoding::new), @@ -208,7 +208,7 @@ impl Reader { source: file.inner, memo_reader: None, header: file.header, - fields_info: file.fields_info, + fields_info: file.fields_info.inner, encoding: file.encoding, options: ReadingOptions::default(), }) diff --git a/src/writing.rs b/src/writing.rs index 80defb0..6940198 100644 --- a/src/writing.rs +++ b/src/writing.rs @@ -252,7 +252,7 @@ impl TableWriterBuilder { let size_of_record = self .v .iter() - .fold(1u16, |s, info| s + info.field_length as u16); + .fold(0u16, |s, info| s + info.field_length as u16); self.hdr.offset_to_first_record = offset_to_first_record as u16; self.hdr.size_of_record = size_of_record;