Skip to content

Commit

Permalink
extendr-api: String conversions are now using
Browse files Browse the repository at this point in the history
length from CHARSXP directly

extendr-api: Use safe CStr -> String conversion.
It does the same as the code removed.

extendr-api: Refactored scalar .as_str
  • Loading branch information
CGMossa committed Feb 12, 2024
1 parent 941607e commit 125907c
Show file tree
Hide file tree
Showing 8 changed files with 158 additions and 53 deletions.
35 changes: 21 additions & 14 deletions extendr-api/src/iter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,23 +55,24 @@ impl StrIter {
}
}

// Get a string reference from a CHARSXP
// Get a string reference from a `CHARSXP`
fn str_from_strsxp<'a>(sexp: SEXP, index: isize) -> &'a str {
single_threaded(|| unsafe {
if index < 0 || index >= Rf_xlength(sexp) {
<&str>::na()
} else {
let charsxp = STRING_ELT(sexp, index);
if charsxp == R_NaString {
<&str>::na()
} else if TYPEOF(charsxp) == CHARSXP {
let ptr = R_CHAR(charsxp) as *const u8;
let slice = std::slice::from_raw_parts(ptr, Rf_xlength(charsxp) as usize);
std::str::from_utf8_unchecked(slice)
} else {
<&str>::na()
}
let charsxp = STRING_ELT(sexp, index);
//TODO: this can be replaced with Robj::as_str, but it isn't
// because of the allocation-protection mechanism that would be
// unnecessary. Use when RawRobj is a thing
if charsxp == R_NaString {
return <&str>::na();
}
if charsxp == R_BlankString {
return "";
}
// if `CHARSXP`, then length is number of non-null bytes.
// assert_eq!(TYPEOF(sexp), CHARSXP);
let length = Rf_xlength(charsxp);
let all_bytes = std::slice::from_raw_parts(R_CHAR(charsxp) as _, length as _);
std::str::from_utf8_unchecked(all_bytes)
})
}

Expand All @@ -92,10 +93,16 @@ impl Iterator for StrIter {
} else if TYPEOF(vector) == STRSXP {
Some(str_from_strsxp(vector, i as isize))
} else if TYPEOF(vector) == INTSXP && TYPEOF(self.levels) == STRSXP {
// factor support: factor is an integer, and we need
// the value of it, to retrieve the assigned label
let j = *(INTEGER(vector).add(i));
// assert_eq!(TYPEOF(self.levels), STRSXP, "levels of a factor must always be a character-vector");
// assert_ne!(j, 0, "invalid factor, where level/label i 0-indexed");
Some(str_from_strsxp(self.levels, j as isize - 1))
} else if TYPEOF(vector) == NILSXP {
Some(<&str>::na())
} else if vector == R_NaString {
Some(<&str>::na())
} else {
None
}
Expand Down
2 changes: 2 additions & 0 deletions extendr-api/src/robj/into_robj.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ pub(crate) fn str_to_character(s: &str) -> SEXP {
R_NaString
} else {
single_threaded(|| {
// this function embeds a terminating NULL
Rf_mkCharLenCE(
s.as_ptr() as *const raw::c_char,
s.len() as i32,
Expand Down Expand Up @@ -709,6 +710,7 @@ impl From<Vec<Rstr>> for Robj {
#[cfg(test)]
mod test {
use super::*;
use crate as extendr_api;

#[test]
fn test_vec_rint_to_robj() {
Expand Down
39 changes: 18 additions & 21 deletions extendr-api/src/robj/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ pub trait Types: GetSexp {
impl Types for Robj {}

impl Robj {
/// Is this object is an `NA` scalar?
/// Is this object an `NA` scalar?
/// Works for character, integer and numeric types.
///
/// ```
Expand All @@ -348,6 +348,9 @@ impl Robj {
unsafe {
let sexp = self.get();
match self.sexptype() {
// a character vector contains `CHARSXP`, and thus you
// seldomly have `Robj`'s that are `CHARSXP` themselves
CHARSXP => sexp == libR_sys::R_NaString,
STRSXP => STRING_ELT(sexp, 0) == libR_sys::R_NaString,
INTSXP => *(INTEGER(sexp)) == libR_sys::R_NaInt,
LGLSXP => *(LOGICAL(sexp)) == libR_sys::R_NaInt,
Expand Down Expand Up @@ -588,18 +591,26 @@ impl Robj {
/// ```
pub fn as_str<'a>(&self) -> Option<&'a str> {
unsafe {
match self.sexptype() {
let charsxp = match self.sexptype() {
STRSXP => {
if self.len() != 1 {
None
} else {
Some(to_str(R_CHAR(STRING_ELT(self.get(), 0)) as *const u8))
return None;
}
Some(STRING_ELT(self.get(), 0))
}
// CHARSXP => Some(to_str(R_CHAR(self.get()) as *const u8)),
// SYMSXP => Some(to_str(R_CHAR(PRINTNAME(self.get())) as *const u8)),
CHARSXP => Some(self.get()),
SYMSXP => Some(PRINTNAME(self.get())),
_ => None,
};

let charsxp = charsxp?;
if charsxp == R_NaString {
return Some(<&str>::na());
}

let length = Rf_xlength(charsxp);
let all_bytes = std::slice::from_raw_parts(R_CHAR(charsxp) as _, length as _);
Some(std::str::from_utf8_unchecked(all_bytes))
}
}

Expand Down Expand Up @@ -1075,20 +1086,6 @@ impl PartialEq<Robj> for Robj {
}
}

// Internal utf8 to str conversion.
// Lets not worry about non-ascii/unicode strings for now (or ever).
pub(crate) unsafe fn to_str<'a>(ptr: *const u8) -> &'a str {
let mut len = 0;
loop {
if *ptr.offset(len) == 0 {
break;
}
len += 1;
}
let slice = std::slice::from_raw_parts(ptr, len as usize);
std::str::from_utf8_unchecked(slice)
}

/// Release any owned objects.
impl Drop for Robj {
fn drop(&mut self) {
Expand Down
2 changes: 2 additions & 0 deletions extendr-api/src/robj/tests.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use crate::scalar::*;
use crate::*;

use crate as extendr_api;

#[test]
fn test_from_robj() {
test! {
Expand Down
8 changes: 5 additions & 3 deletions extendr-api/src/wrapper/pairlist.rs
Original file line number Diff line number Diff line change
Expand Up @@ -139,9 +139,11 @@ impl Iterator for PairlistIter {
self.list_elem = CDR(sexp);
if TYPEOF(tag) == SYMSXP {
let printname = PRINTNAME(tag);
assert!(TYPEOF(printname) == CHARSXP);
let name = to_str(R_CHAR(printname) as *const u8);
Some((std::mem::transmute(name), value))
// assert_eq!(TYPEOF(printname), CHARSXP); // printname is always a CHARSXP
//TODO: use RawRobj here once it is a thing
let length = Rf_xlength(printname);
let all_bytes = std::slice::from_raw_parts(R_CHAR(printname) as _, length as _);
Some((std::str::from_utf8_unchecked(all_bytes), value))
} else {
// empty string represents the absense of the name
Some(("", value))
Expand Down
41 changes: 28 additions & 13 deletions extendr-api/src/wrapper/rstr.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use super::*;

/// Wrapper for creating CHARSXP objects.
/// Wrapper for creating `CHARSXP` objects.
/// These are used only as the contents of a character
/// vector.
///
Expand All @@ -17,14 +17,6 @@ pub struct Rstr {
pub(crate) robj: Robj,
}

pub(crate) unsafe fn sexp_to_str(sexp: SEXP) -> &'static str {
if sexp == R_NaString {
<&str>::na()
} else {
std::mem::transmute(to_str(R_CHAR(sexp) as *const u8))
}
}

impl Rstr {
/// Make a character object from a string.
pub fn from_string(val: &str) -> Self {
Expand All @@ -36,26 +28,30 @@ impl Rstr {
/// Get the string from a character object.
/// If the string is NA, then the special na_str() is returned.
pub fn as_str(&self) -> &str {
unsafe { sexp_to_str(self.robj.get()) }
if unsafe { self.robj.get() == R_NaString } {
<&str>::na()
} else {
self.into()
}
}
}

impl AsRef<str> for Rstr {
/// Treat a Rstr as a string slice.
/// Treat a `Rstr` as a string slice.
fn as_ref(&self) -> &str {
self.as_str()
}
}

impl From<String> for Rstr {
/// Convert a String to a Rstr.
/// Convert a `String` to a `Rstr`.
fn from(s: String) -> Self {
Rstr::from_string(&s)
}
}

impl From<&str> for Rstr {
/// Convert a string slice to a Rstr.
/// Convert a `String` slice to a `Rstr`.
fn from(s: &str) -> Self {
Rstr::from_string(s)
}
Expand All @@ -71,6 +67,25 @@ impl From<Option<String>> for Rstr {
}
}

impl From<&Rstr> for &str {
fn from(value: &Rstr) -> Self {
unsafe {
let sexp = value.robj.get();
if sexp == R_NaString {
return Self::na();
}
if sexp == R_BlankString {
return "";
}
// if `CHARSXP`, then length is number of non-null bytes.
assert_eq!(TYPEOF(sexp), CHARSXP);
let length = Rf_xlength(sexp);
let all_bytes = std::slice::from_raw_parts(R_CHAR(sexp) as _, length as _);
std::str::from_utf8_unchecked(all_bytes)
}
}
}

impl Deref for Rstr {
type Target = str;

Expand Down
7 changes: 5 additions & 2 deletions extendr-api/src/wrapper/symbol.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,11 @@ impl Symbol {
unsafe {
let sexp = self.robj.get();
let printname = PRINTNAME(sexp);
assert!(TYPEOF(printname) == CHARSXP);
to_str(R_CHAR(printname) as *const u8)
//TODO: Use `RawRobj` when it is a thing
// assert_eq!(TYPEOF(printname), CHARSXP); // printname is always CHARSXP
let length = Rf_xlength(printname);
let all_bytes = std::slice::from_raw_parts(R_CHAR(printname) as _, length as _);
std::str::from_utf8_unchecked(all_bytes)
}
}
}
Expand Down
77 changes: 77 additions & 0 deletions extendr-api/tests/string_tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
// use std::ffi::CStr;
#[allow(unused_imports)]
use extendr_api::{r, GetSexp, Robj};
use extendr_engine::with_r;
use extendr_macros::R;
#[allow(unused_imports)]
use libR_sys::{
R_BlankScalarString, R_NilValue, R_ParseString, Rf_PrintValue, Rf_xlength, STRING_ELT, TYPEOF,
};
#[allow(unused_imports)]
use libR_sys::{R_BlankString, R_NaString, CHARSXP, STRSXP};

/// This is a "test" to print out all the assumptions on strings we have,
/// from empty, NA, and multi-byte UTF8 characters
#[test]
fn test_byte_length() {
// assuming that for a CHARSXP, the function Rf_xlength
with_r(|| unsafe {
// what is a Scalar NA string?

let na_character_ = R!(r#"NA_character_"#).unwrap();
assert_eq!(TYPEOF(na_character_.get()), STRSXP);
let string_with_na = R!(r#"c(NA_character_)"#).unwrap();
assert_eq!(TYPEOF(string_with_na.get()), STRSXP);
assert_ne!(
string_with_na.get(),
R_NaString,
"a character vector with single NA is not NA"
);
assert_ne!(
string_with_na.get(),
R_BlankScalarString,
"c(NA_character_) is not BlankScalarString"
);
assert_eq!(TYPEOF(STRING_ELT(string_with_na.get(), 0)), CHARSXP);
assert_eq!(
STRING_ELT(string_with_na.get(), 0),
R_NaString,
"the CHARSXP is NA"
);

let r_str_vec = R!(r#"c(NA_character_, "", "1", "123", "✅")"#).unwrap();
Rf_PrintValue(r_str_vec.get());

// let r_str_vec_ref = &r_str_vec;
// Rf_PrintValue(R!("length({{r_str_vec_ref}})").unwrap().get());

dbg!(
TYPEOF(r_str_vec.get()),
Rf_xlength(r_str_vec.get()),
// Rf_length(r_str_vec.get())
);

for i in 0..Rf_xlength(r_str_vec.get()) {
dbg!(i);
let elt = STRING_ELT(r_str_vec.get(), i);
// print!("elt =");
Rf_PrintValue(elt);
dbg!(
TYPEOF(elt),
Rf_xlength(elt),
libR_sys::Rf_length(elt),
elt == libR_sys::R_NaString,
elt == libR_sys::R_BlankString,
elt == libR_sys::R_BlankScalarString,
// same as `Rf_xlength` anyways
// R_nchar(
// elt,
// nchar_type_Bytes,
// libR_sys::Rboolean::FALSE,
// libR_sys::Rboolean::TRUE,
// ptr::null()
// )
);
}
});
}

0 comments on commit 125907c

Please sign in to comment.