From 125907cabad70c08bb85a9e5a521580135eff686 Mon Sep 17 00:00:00 2001 From: Mossa Date: Mon, 12 Feb 2024 14:28:44 +0100 Subject: [PATCH] extendr-api: String conversions are now using length from CHARSXP directly extendr-api: Use safe CStr -> String conversion. It does the same as the code removed. extendr-api: Refactored scalar .as_str --- extendr-api/src/iter.rs | 35 +++++++------ extendr-api/src/robj/into_robj.rs | 2 + extendr-api/src/robj/mod.rs | 39 +++++++-------- extendr-api/src/robj/tests.rs | 2 + extendr-api/src/wrapper/pairlist.rs | 8 +-- extendr-api/src/wrapper/rstr.rs | 41 ++++++++++----- extendr-api/src/wrapper/symbol.rs | 7 ++- extendr-api/tests/string_tests.rs | 77 +++++++++++++++++++++++++++++ 8 files changed, 158 insertions(+), 53 deletions(-) create mode 100644 extendr-api/tests/string_tests.rs diff --git a/extendr-api/src/iter.rs b/extendr-api/src/iter.rs index 9a24d87e0b..d3a82353a3 100644 --- a/extendr-api/src/iter.rs +++ b/extendr-api/src/iter.rs @@ -55,23 +55,24 @@ impl StrIter { } } -// Get a string reference from a CHARSXP +// Get a string reference from a `CHARSXP` fn str_from_strsxp<'a>(sexp: SEXP, index: isize) -> &'a str { single_threaded(|| unsafe { - if index < 0 || index >= Rf_xlength(sexp) { - <&str>::na() - } else { - let charsxp = STRING_ELT(sexp, index); - if charsxp == R_NaString { - <&str>::na() - } else if TYPEOF(charsxp) == CHARSXP { - let ptr = R_CHAR(charsxp) as *const u8; - let slice = std::slice::from_raw_parts(ptr, Rf_xlength(charsxp) as usize); - std::str::from_utf8_unchecked(slice) - } else { - <&str>::na() - } + let charsxp = STRING_ELT(sexp, index); + //TODO: this can be replaced with Robj::as_str, but it isn't + // because of the allocation-protection mechanism that would be + // unnecessary. Use when RawRobj is a thing + if charsxp == R_NaString { + return <&str>::na(); } + if charsxp == R_BlankString { + return ""; + } + // if `CHARSXP`, then length is number of non-null bytes. + // assert_eq!(TYPEOF(sexp), CHARSXP); + let length = Rf_xlength(charsxp); + let all_bytes = std::slice::from_raw_parts(R_CHAR(charsxp) as _, length as _); + std::str::from_utf8_unchecked(all_bytes) }) } @@ -92,10 +93,16 @@ impl Iterator for StrIter { } else if TYPEOF(vector) == STRSXP { Some(str_from_strsxp(vector, i as isize)) } else if TYPEOF(vector) == INTSXP && TYPEOF(self.levels) == STRSXP { + // factor support: factor is an integer, and we need + // the value of it, to retrieve the assigned label let j = *(INTEGER(vector).add(i)); + // assert_eq!(TYPEOF(self.levels), STRSXP, "levels of a factor must always be a character-vector"); + // assert_ne!(j, 0, "invalid factor, where level/label i 0-indexed"); Some(str_from_strsxp(self.levels, j as isize - 1)) } else if TYPEOF(vector) == NILSXP { Some(<&str>::na()) + } else if vector == R_NaString { + Some(<&str>::na()) } else { None } diff --git a/extendr-api/src/robj/into_robj.rs b/extendr-api/src/robj/into_robj.rs index b0b6a32069..ddadd54755 100644 --- a/extendr-api/src/robj/into_robj.rs +++ b/extendr-api/src/robj/into_robj.rs @@ -10,6 +10,7 @@ pub(crate) fn str_to_character(s: &str) -> SEXP { R_NaString } else { single_threaded(|| { + // this function embeds a terminating NULL Rf_mkCharLenCE( s.as_ptr() as *const raw::c_char, s.len() as i32, @@ -709,6 +710,7 @@ impl From> for Robj { #[cfg(test)] mod test { use super::*; + use crate as extendr_api; #[test] fn test_vec_rint_to_robj() { diff --git a/extendr-api/src/robj/mod.rs b/extendr-api/src/robj/mod.rs index 09247af67d..af5af930b1 100644 --- a/extendr-api/src/robj/mod.rs +++ b/extendr-api/src/robj/mod.rs @@ -329,7 +329,7 @@ pub trait Types: GetSexp { impl Types for Robj {} impl Robj { - /// Is this object is an `NA` scalar? + /// Is this object an `NA` scalar? /// Works for character, integer and numeric types. /// /// ``` @@ -348,6 +348,9 @@ impl Robj { unsafe { let sexp = self.get(); match self.sexptype() { + // a character vector contains `CHARSXP`, and thus you + // seldomly have `Robj`'s that are `CHARSXP` themselves + CHARSXP => sexp == libR_sys::R_NaString, STRSXP => STRING_ELT(sexp, 0) == libR_sys::R_NaString, INTSXP => *(INTEGER(sexp)) == libR_sys::R_NaInt, LGLSXP => *(LOGICAL(sexp)) == libR_sys::R_NaInt, @@ -588,18 +591,26 @@ impl Robj { /// ``` pub fn as_str<'a>(&self) -> Option<&'a str> { unsafe { - match self.sexptype() { + let charsxp = match self.sexptype() { STRSXP => { if self.len() != 1 { - None - } else { - Some(to_str(R_CHAR(STRING_ELT(self.get(), 0)) as *const u8)) + return None; } + Some(STRING_ELT(self.get(), 0)) } - // CHARSXP => Some(to_str(R_CHAR(self.get()) as *const u8)), - // SYMSXP => Some(to_str(R_CHAR(PRINTNAME(self.get())) as *const u8)), + CHARSXP => Some(self.get()), + SYMSXP => Some(PRINTNAME(self.get())), _ => None, + }; + + let charsxp = charsxp?; + if charsxp == R_NaString { + return Some(<&str>::na()); } + + let length = Rf_xlength(charsxp); + let all_bytes = std::slice::from_raw_parts(R_CHAR(charsxp) as _, length as _); + Some(std::str::from_utf8_unchecked(all_bytes)) } } @@ -1075,20 +1086,6 @@ impl PartialEq for Robj { } } -// Internal utf8 to str conversion. -// Lets not worry about non-ascii/unicode strings for now (or ever). -pub(crate) unsafe fn to_str<'a>(ptr: *const u8) -> &'a str { - let mut len = 0; - loop { - if *ptr.offset(len) == 0 { - break; - } - len += 1; - } - let slice = std::slice::from_raw_parts(ptr, len as usize); - std::str::from_utf8_unchecked(slice) -} - /// Release any owned objects. impl Drop for Robj { fn drop(&mut self) { diff --git a/extendr-api/src/robj/tests.rs b/extendr-api/src/robj/tests.rs index 9f6626a906..8fb21722a3 100644 --- a/extendr-api/src/robj/tests.rs +++ b/extendr-api/src/robj/tests.rs @@ -1,6 +1,8 @@ use crate::scalar::*; use crate::*; +use crate as extendr_api; + #[test] fn test_from_robj() { test! { diff --git a/extendr-api/src/wrapper/pairlist.rs b/extendr-api/src/wrapper/pairlist.rs index 4c9992b3c0..84978b2ad4 100644 --- a/extendr-api/src/wrapper/pairlist.rs +++ b/extendr-api/src/wrapper/pairlist.rs @@ -139,9 +139,11 @@ impl Iterator for PairlistIter { self.list_elem = CDR(sexp); if TYPEOF(tag) == SYMSXP { let printname = PRINTNAME(tag); - assert!(TYPEOF(printname) == CHARSXP); - let name = to_str(R_CHAR(printname) as *const u8); - Some((std::mem::transmute(name), value)) + // assert_eq!(TYPEOF(printname), CHARSXP); // printname is always a CHARSXP + //TODO: use RawRobj here once it is a thing + let length = Rf_xlength(printname); + let all_bytes = std::slice::from_raw_parts(R_CHAR(printname) as _, length as _); + Some((std::str::from_utf8_unchecked(all_bytes), value)) } else { // empty string represents the absense of the name Some(("", value)) diff --git a/extendr-api/src/wrapper/rstr.rs b/extendr-api/src/wrapper/rstr.rs index ee63c0ddb7..27b9902395 100644 --- a/extendr-api/src/wrapper/rstr.rs +++ b/extendr-api/src/wrapper/rstr.rs @@ -1,6 +1,6 @@ use super::*; -/// Wrapper for creating CHARSXP objects. +/// Wrapper for creating `CHARSXP` objects. /// These are used only as the contents of a character /// vector. /// @@ -17,14 +17,6 @@ pub struct Rstr { pub(crate) robj: Robj, } -pub(crate) unsafe fn sexp_to_str(sexp: SEXP) -> &'static str { - if sexp == R_NaString { - <&str>::na() - } else { - std::mem::transmute(to_str(R_CHAR(sexp) as *const u8)) - } -} - impl Rstr { /// Make a character object from a string. pub fn from_string(val: &str) -> Self { @@ -36,26 +28,30 @@ impl Rstr { /// Get the string from a character object. /// If the string is NA, then the special na_str() is returned. pub fn as_str(&self) -> &str { - unsafe { sexp_to_str(self.robj.get()) } + if unsafe { self.robj.get() == R_NaString } { + <&str>::na() + } else { + self.into() + } } } impl AsRef for Rstr { - /// Treat a Rstr as a string slice. + /// Treat a `Rstr` as a string slice. fn as_ref(&self) -> &str { self.as_str() } } impl From for Rstr { - /// Convert a String to a Rstr. + /// Convert a `String` to a `Rstr`. fn from(s: String) -> Self { Rstr::from_string(&s) } } impl From<&str> for Rstr { - /// Convert a string slice to a Rstr. + /// Convert a `String` slice to a `Rstr`. fn from(s: &str) -> Self { Rstr::from_string(s) } @@ -71,6 +67,25 @@ impl From> for Rstr { } } +impl From<&Rstr> for &str { + fn from(value: &Rstr) -> Self { + unsafe { + let sexp = value.robj.get(); + if sexp == R_NaString { + return Self::na(); + } + if sexp == R_BlankString { + return ""; + } + // if `CHARSXP`, then length is number of non-null bytes. + assert_eq!(TYPEOF(sexp), CHARSXP); + let length = Rf_xlength(sexp); + let all_bytes = std::slice::from_raw_parts(R_CHAR(sexp) as _, length as _); + std::str::from_utf8_unchecked(all_bytes) + } + } +} + impl Deref for Rstr { type Target = str; diff --git a/extendr-api/src/wrapper/symbol.rs b/extendr-api/src/wrapper/symbol.rs index 688503828e..436f623484 100644 --- a/extendr-api/src/wrapper/symbol.rs +++ b/extendr-api/src/wrapper/symbol.rs @@ -53,8 +53,11 @@ impl Symbol { unsafe { let sexp = self.robj.get(); let printname = PRINTNAME(sexp); - assert!(TYPEOF(printname) == CHARSXP); - to_str(R_CHAR(printname) as *const u8) + //TODO: Use `RawRobj` when it is a thing + // assert_eq!(TYPEOF(printname), CHARSXP); // printname is always CHARSXP + let length = Rf_xlength(printname); + let all_bytes = std::slice::from_raw_parts(R_CHAR(printname) as _, length as _); + std::str::from_utf8_unchecked(all_bytes) } } } diff --git a/extendr-api/tests/string_tests.rs b/extendr-api/tests/string_tests.rs new file mode 100644 index 0000000000..10ecbe7de1 --- /dev/null +++ b/extendr-api/tests/string_tests.rs @@ -0,0 +1,77 @@ +// use std::ffi::CStr; +#[allow(unused_imports)] +use extendr_api::{r, GetSexp, Robj}; +use extendr_engine::with_r; +use extendr_macros::R; +#[allow(unused_imports)] +use libR_sys::{ + R_BlankScalarString, R_NilValue, R_ParseString, Rf_PrintValue, Rf_xlength, STRING_ELT, TYPEOF, +}; +#[allow(unused_imports)] +use libR_sys::{R_BlankString, R_NaString, CHARSXP, STRSXP}; + +/// This is a "test" to print out all the assumptions on strings we have, +/// from empty, NA, and multi-byte UTF8 characters +#[test] +fn test_byte_length() { + // assuming that for a CHARSXP, the function Rf_xlength + with_r(|| unsafe { + // what is a Scalar NA string? + + let na_character_ = R!(r#"NA_character_"#).unwrap(); + assert_eq!(TYPEOF(na_character_.get()), STRSXP); + let string_with_na = R!(r#"c(NA_character_)"#).unwrap(); + assert_eq!(TYPEOF(string_with_na.get()), STRSXP); + assert_ne!( + string_with_na.get(), + R_NaString, + "a character vector with single NA is not NA" + ); + assert_ne!( + string_with_na.get(), + R_BlankScalarString, + "c(NA_character_) is not BlankScalarString" + ); + assert_eq!(TYPEOF(STRING_ELT(string_with_na.get(), 0)), CHARSXP); + assert_eq!( + STRING_ELT(string_with_na.get(), 0), + R_NaString, + "the CHARSXP is NA" + ); + + let r_str_vec = R!(r#"c(NA_character_, "", "1", "123", "✅")"#).unwrap(); + Rf_PrintValue(r_str_vec.get()); + + // let r_str_vec_ref = &r_str_vec; + // Rf_PrintValue(R!("length({{r_str_vec_ref}})").unwrap().get()); + + dbg!( + TYPEOF(r_str_vec.get()), + Rf_xlength(r_str_vec.get()), + // Rf_length(r_str_vec.get()) + ); + + for i in 0..Rf_xlength(r_str_vec.get()) { + dbg!(i); + let elt = STRING_ELT(r_str_vec.get(), i); + // print!("elt ="); + Rf_PrintValue(elt); + dbg!( + TYPEOF(elt), + Rf_xlength(elt), + libR_sys::Rf_length(elt), + elt == libR_sys::R_NaString, + elt == libR_sys::R_BlankString, + elt == libR_sys::R_BlankScalarString, + // same as `Rf_xlength` anyways + // R_nchar( + // elt, + // nchar_type_Bytes, + // libR_sys::Rboolean::FALSE, + // libR_sys::Rboolean::TRUE, + // ptr::null() + // ) + ); + } + }); +}