From ef6029bfd8485fb20fb4d502c48c1ed60708caa1 Mon Sep 17 00:00:00 2001 From: luocheng <805914196@qq.com> Date: Tue, 26 Nov 2024 20:05:20 +0800 Subject: [PATCH 1/3] Use encoding_rs to add more character encoding support, including Chinese character encoding, such as GBK --- src/bin/cli.rs | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/bin/cli.rs diff --git a/src/bin/cli.rs b/src/bin/cli.rs new file mode 100644 index 0000000..e69de29 From 3d0e3474ddab54d4750a0c6325fdf7f17299d6eb Mon Sep 17 00:00:00 2001 From: luocheng <805914196@qq.com> Date: Wed, 27 Nov 2024 10:19:36 +0800 Subject: [PATCH 2/3] Use encoding_rs to add more character encoding support, including Chinese character encoding, such as GBK --- Cargo.toml | 2 ++ src/bin/cli.rs | 14 +++++++++++++ src/encoding.rs | 49 +++++++++++++++++++++++++++++++++++++++++++ src/header.rs | 34 ++++++++++++++++++++++++++++-- tests/data/cp936.dbf | Bin 0 -> 117 bytes tests/tests.rs | 16 ++++++++++++++ 6 files changed, 113 insertions(+), 2 deletions(-) create mode 100644 tests/data/cp936.dbf diff --git a/Cargo.toml b/Cargo.toml index 23a62d9..12d10c3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,8 @@ yore = { version = "1.0.1", optional = true } datafusion = { version = "31", optional = true } datafusion-expr = { version = "31", optional = true } async-trait = { version = "0.1", optional = true } +codepage= "0.1.2" +encoding_rs = "0.8.35" [dev-dependencies] serde_derive = "1.0.102" diff --git a/src/bin/cli.rs b/src/bin/cli.rs index e69de29..c5ab6be 100644 --- a/src/bin/cli.rs +++ b/src/bin/cli.rs @@ -0,0 +1,14 @@ + +fn main() { + let cp936_dbf: &str = "tests/data/cp936.dbf"; + + //let mut reader = dbase::Reader::from_path(cp850_dbf).unwrap(); + let mut reader = dbase::Reader::from_path(cp936_dbf).unwrap(); + let records = reader.read().unwrap(); + for record in records { + println!("{:?}", record.get("TEST")); + } + +} + + diff --git a/src/encoding.rs b/src/encoding.rs index 20b90e9..4ce8b5e 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -167,6 +167,55 @@ impl Encoding for DynEncoding { } } + +pub struct EncodingRs(&'static encoding_rs::Encoding); +impl From<&'static encoding_rs::Encoding> for EncodingRs { + fn from(item: &'static encoding_rs::Encoding) -> Self { + EncodingRs(item) + } +} +impl Clone for EncodingRs { + fn clone(&self) -> Self { + EncodingRs(self.0.clone()) + } +} + +impl AsCodePageMark for EncodingRs { + fn code_page_mark(&self) -> crate::CodePageMark { + let code_page = codepage::from_encoding(self.0).unwrap(); + match code_page { + 1252 => crate::CodePageMark::CP1252, + 866 => crate::CodePageMark::CP866, + 874 => crate::CodePageMark::CP874, + 1255 => crate::CodePageMark::CP1255, + 1256 => crate::CodePageMark::CP1256, + 1250 => crate::CodePageMark::CP1250, + 1251 => crate::CodePageMark::CP1251, + 1254 => crate::CodePageMark::CP1254, + 1253 => crate::CodePageMark::CP1253, + 65001 => crate::CodePageMark::Utf8, + 950 => crate::CodePageMark::CP950, + 949 => crate::CodePageMark::CP949, + 936 => crate::CodePageMark::CP936, + 932 => crate::CodePageMark::CP932, + _=> crate::CodePageMark::Utf8, + } + + } +} + +impl Encoding for EncodingRs +{ + fn decode<'a>(&self, bytes: &'a [u8]) -> Result, DecodeError> { + Ok(self.0.decode(bytes).0) + } + + fn encode<'a>(&self, s: &'a str) -> Result, EncodeError> { + Ok(self.0.encode(s).0) + } +} + + #[cfg(feature = "yore")] impl Encoding for T where diff --git a/src/header.rs b/src/header.rs index efa13b8..d9ced1a 100644 --- a/src/header.rs +++ b/src/header.rs @@ -1,6 +1,6 @@ use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use crate::encoding::DynEncoding; +use crate::encoding::{DynEncoding, EncodingRs}; use std::io::{Read, Write}; use crate::field::types::Date; @@ -94,7 +94,36 @@ impl CodePageMark { } #[cfg(not(feature = "yore"))] { - Some(DynEncoding::new(crate::encoding::UnicodeLossy)) + //Some(DynEncoding::new(crate::encoding::UnicodeLossy)) + let code_page = match self { + //CodePageMark::CP437 => 437, + //CodePageMark::CP850 => 850, + CodePageMark::CP1252 => 1252, + //CodePageMark::CP852 => 852 , + CodePageMark::CP866 => 866 , + //CodePageMark::CP865 => 865 , + //CodePageMark::CP861 => 861, + CodePageMark::CP874 => 874, + CodePageMark::CP1255 => 1255, + CodePageMark::CP1256 => 1256, + CodePageMark::CP1250 => 1250, + CodePageMark::CP1251 => 1251, + CodePageMark::CP1254 => 1254, + CodePageMark::CP1253 => 1253, + CodePageMark::Utf8 => 65001, + //CodePageMark::CP895 => 895, + //CodePageMark::CP620 => 620, + //CodePageMark::CP737 => 737, + //CodePageMark::CP857 => 857, + CodePageMark::CP950 => 950, + CodePageMark::CP949 => 949, + CodePageMark::CP936 => 936, + CodePageMark::CP932 => 932, + _ => 65001 + }; + let gbk :EncodingRs = (codepage::to_encoding(code_page).unwrap()).into(); + Some(DynEncoding::new(gbk)) + } } } @@ -106,6 +135,7 @@ impl From for CodePageMark { 0x01 => Self::CP437, 0x02 => Self::CP850, 0x03 => Self::CP1252, + 0x4D => Self::CP936, // 0x04 => Self::StandardMacIntosh, 0x64 => Self::CP852, 0x65 => Self::CP866, diff --git a/tests/data/cp936.dbf b/tests/data/cp936.dbf new file mode 100644 index 0000000000000000000000000000000000000000..076398f163ce5d1277640fc16550f42083a10f4e GIT binary patch literal 117 ycmZRs;g(`#U|?`$Fb0xnz?Xp`#5FhsD(VcPjnI_vDr|am>dLhX=Z+{44Wt0iz7Gum literal 0 HcmV?d00001 diff --git a/tests/tests.rs b/tests/tests.rs index 1b3784a..929146d 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -18,6 +18,9 @@ const STATIONS_WITH_DELETED: &str = "./tests/data/stations_with_deleted.dbf"; #[cfg(feature = "yore")] const CP850_DBF: &str = "tests/data/cp850.dbf"; + +const CP936_DBF: &str = "tests/data/cp936.dbf"; + fn write_read_compare(records: &Vec, writer_builder: TableWriterBuilder) where R: WritableRecord + ReadableRecord + Debug + PartialEq, @@ -361,3 +364,16 @@ fn test_record_marked_as_deleted_are_skipped_by_reader() -> Result<(), Box Date: Wed, 27 Nov 2024 10:20:46 +0800 Subject: [PATCH 3/3] =?UTF-8?q?=E5=88=A0=E9=99=A4cli?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/bin/cli.rs | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 src/bin/cli.rs diff --git a/src/bin/cli.rs b/src/bin/cli.rs deleted file mode 100644 index c5ab6be..0000000 --- a/src/bin/cli.rs +++ /dev/null @@ -1,14 +0,0 @@ - -fn main() { - let cp936_dbf: &str = "tests/data/cp936.dbf"; - - //let mut reader = dbase::Reader::from_path(cp850_dbf).unwrap(); - let mut reader = dbase::Reader::from_path(cp936_dbf).unwrap(); - let records = reader.read().unwrap(); - for record in records { - println!("{:?}", record.get("TEST")); - } - -} - -