Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

isfiletext() checks whether a file is text or binary… #239

Merged
merged 8 commits into from
Dec 21, 2019
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ export(get_ext)
export(import)
export(import_list)
export(install_formats)
export(isfiletext)
export(spread_attrs)
importFrom(curl,curl_fetch_memory)
importFrom(curl,parse_headers)
Expand Down
34 changes: 34 additions & 0 deletions R/isfiletext.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#' Determine whether a file is "plain-text" or some sort of binary format
#'
#' Reads up to `maxsize` bytes from `filename` in binary mode and checks
#' whether every byte read belongs to the set of bytes given in `textbytes`.
#'
#' @param filename Path to the file
#' @param maxsize  Maximum number of bytes to read. The default (`Inf`) reads
#'                 the whole file.
#' @param textbytes Which characters are used by normal (though not necessarily
#'                  just ASCII) text. To detect just ASCII, the following value
#'                  can be used: `as.raw(c(7:16,18,19,32:127))`
#' @param tf       If `TRUE` (default) simply return `TRUE` when `filename`
#'                 references a text-only file and `FALSE` otherwise. If set to
#'                 `FALSE` then returns the "non text" bytes found in the file.
#'
#' @return If `tf` is `TRUE`, a single logical value: `TRUE` when none of the
#'         bytes read fall outside `textbytes` (this includes the case where
#'         no bytes were read), `FALSE` otherwise. If `tf` is `FALSE`, a raw
#'         vector holding the distinct bytes that are not in `textbytes`, in
#'         order of first appearance.
#' @export
#' @examples
#' library(datasets)
#' export(iris,"iris.yml")
#' isfiletext("iris.yml")
#' ## TRUE
#'
#' export(iris,"iris.sav")
#' isfiletext("iris.sav")
#' ## FALSE
#' isfiletext("iris.sav", tf=FALSE)
#' ## These are the characters found in "iris.sav" that are not printable text
#' ## 02 00 05 03 06 04 01 14 15 11 17 16 1c 19 1b 1a 18 1e 1d 1f
#'
#' ## remove the example files again
#' unlink(c("iris.yml", "iris.sav"))
isfiletext <- function(filename, maxsize = Inf,
                       textbytes = as.raw(c(0x7:0x10, 0x12, 0x13, 0x20:0xFF)),
                       tf = TRUE) {
  con <- file(filename, "rb")
  # Register the close immediately so the connection is released even when
  # readBin() fails (e.g. for an invalid `maxsize`).
  on.exit(close(con), add = TRUE)

  # Never ask for more bytes than the file holds, nor more than the caller
  # allows.
  nbytes <- min(file.info(filename)$size, maxsize)
  bytes <- readBin(con, raw(), n = nbytes)

  # Only the distinct byte values matter; de-duplicating first keeps the set
  # difference cheap for large files without changing its result or order.
  nontextbytes <- setdiff(unique(bytes), textbytes)

  if (tf) length(nontextbytes) == 0 else nontextbytes
}
45 changes: 45 additions & 0 deletions man/isfiletext.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

71 changes: 71 additions & 0 deletions tests/testthat/test_isfiletext.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
context("correctly identifying files as text vs binary")
library("datasets")

# Formats exercised unconditionally, grouped by the classification that
# isfiletext() is expected to report for the exported file.
txtformats <- c("arff", "csv", "csv2", "dump", "fwf", "psv", "r", "tsv", "txt")
binformats <- c("dbf", "dta", "rda", "rdata", "rds", "sas7bdat", "sav", "xlsx",
                "xpt")

# Local copy of iris whose column names use "_" instead of ".".
# NOTE(review): presumably some export formats reject dots in variable
# names -- confirm.
names(iris) <- gsub("\\.", "_", names(iris))

# Every exported file goes into a scratch directory so that the tests never
# write into the working directory and cleanup is a single recursive unlink.
outdir <- tempfile("isfiletext_")
dir.create(outdir)
iris_file <- function(ext) file.path(outdir, paste0("iris.", ext))

# The tests below pass the value returned by export() straight to
# isfiletext(), i.e. they rely on export() returning the path it wrote.

test_that("Required text formats recognized as text", {
  for (xx in txtformats) {
    expect_true(isfiletext(export(iris, iris_file(xx))),
                label = paste0("isfiletext() on exported ", xx, " file"))
  }
})

test_that("Required non-text formats recognized as non-text", {
  for (xx in binformats) {
    expect_false(isfiletext(export(iris, iris_file(xx))),
                 label = paste0("isfiletext() on exported ", xx, " file"))
  }
})

test_that("csvy recognized as text", {
  skip_if_not_installed(pkg = "csvy")
  expect_true(isfiletext(export(iris, iris_file("csvy"))))
})

test_that("xml and html recognized as text", {
  skip_if_not_installed(pkg = "xml2")
  expect_true(isfiletext(export(iris, iris_file("xml"))))
  expect_true(isfiletext(export(iris, iris_file("html"))))
})

test_that("json recognized as text", {
  skip_if_not_installed(pkg = "jsonlite")
  expect_true(isfiletext(export(iris, iris_file("json"))))
})

test_that("yml recognized as text", {
  skip_if_not_installed(pkg = "yaml")
  expect_true(isfiletext(export(iris, iris_file("yml"))))
})

test_that("pzfx recognized as text", {
  skip_if_not_installed(pkg = "pzfx")
  # The fifth column (Species) is left out of the pzfx export.
  expect_true(isfiletext(export(iris[, -5], iris_file("pzfx"))))
})

# binformats_suggest <- c("matlab","ods","fst","feather")
test_that("matlab recognized as binary", {
  skip_if_not_installed(pkg = "rmatio")
  expect_false(isfiletext(export(iris, iris_file("matlab"))))
})

test_that("ods recognized as binary", {
  skip_if_not_installed(pkg = "readODS")
  expect_false(isfiletext(export(iris, iris_file("ods"))))
})

test_that("fst recognized as binary", {
  skip_if_not_installed(pkg = "fst")
  expect_false(isfiletext(export(iris, iris_file("fst"))))
})

test_that("feather recognized as binary", {
  skip_if_not_installed(pkg = "feather")
  expect_false(isfiletext(export(iris, iris_file("feather"))))
})

# Remove everything that was exported, then drop the helper objects.
unlink(outdir, recursive = TRUE)
rm(iris, txtformats, binformats, outdir, iris_file)