Skip to content

Commit

Permalink
is_file_text() checks whether a file is text or binary… (#239)
Browse files Browse the repository at this point in the history
* isfiletext() checks whether a file is text or binary in a cross-platform manner, completing ticket #236. This can be useful when a file extension is missing or ambiguous.

* Minor tweak-- removed redundant 'fwf' entry from 'txtformats' in the test for isfiletext()

* Incorporating feedback from #239#pullrequestreview-335144814 and merging in changes to master

* Incorporating review feedback
  • Loading branch information
bokov authored and leeper committed Dec 21, 2019
1 parent 35755f0 commit ffc7c80
Show file tree
Hide file tree
Showing 5 changed files with 143 additions and 0 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ export(get_ext)
export(import)
export(import_list)
export(install_formats)
export(is_file_text)
export(spread_attrs)
importFrom(curl,curl_fetch_memory)
importFrom(curl,parse_headers)
Expand Down
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# rio 0.5.22

* Added an `export_list()` function to write a list of data frames to multiple files using a vector of file names or a file pattern. (#207, h/t Bill Denney)
* Added an `is_file_text()` function to determine whether a file is in a plain-text format. Optionally, a narrower subset of characters can be specified, e.g. ASCII. (#236, Alex Bokov)

# rio 0.5.21

Expand Down
31 changes: 31 additions & 0 deletions R/isfiletext.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#' @title Determine whether a file is "plain-text" or some sort of binary format
#'
#' @description Reads up to \code{maxsize} bytes from the start of \code{file}
#'   and reports whether every byte read belongs to \code{text_bytes}.
#'
#' @param file Path to the file
#' @param maxsize Maximum number of bytes to read
#' @param text_bytes Which characters are used by normal text (though not
#'   necessarily just ASCII). To detect just ASCII, the
#'   following value can be used:
#'   \code{as.raw(c(7:16, 18, 19, 32:127))}
#'
#' @return A logical: \code{TRUE} when none of the bytes read fall outside
#'   \code{text_bytes} (so an empty file yields \code{TRUE}), \code{FALSE}
#'   otherwise.
#' @export
#' @examples
#' library(datasets)
#' export(iris, "iris.yml")
#' is_file_text("iris.yml")
#' ## TRUE
#'
#' export(iris, "iris.sav")
#' is_file_text("iris.sav")
#' ## FALSE
#'
#' ## remove the files written by this example
#' unlink(c("iris.yml", "iris.sav"))
#'
is_file_text <- function(
    file, maxsize = Inf,
    text_bytes = as.raw(c(0x7:0x10, 0x12, 0x13, 0x20:0xFF))) {

  con <- file(file, "rb")
  # Register the close right away so the connection is released even when
  # readBin() fails (for example on an invalid `maxsize`).
  on.exit(close(con), add = TRUE)

  # Never request more bytes than the file holds, nor more than `maxsize`.
  bytes <- readBin(con, what = raw(), n = min(file.info(file)$size, maxsize))

  # The file counts as text when no byte read lies outside the allowed set.
  all(bytes %in% text_bytes)
}
38 changes: 38 additions & 0 deletions man/is_file_text.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

72 changes: 72 additions & 0 deletions tests/testthat/test_isfiletext.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
context("correctly identifying files as text vs binary")
# library() errors if the package cannot be attached, whereas require() only
# returns FALSE and would let the tests continue without `iris`.
library("datasets")

# Formats whose exported files are expected to be classified as text.
txtformats <- c("arff", "csv", "csv2", "dump", "fwf", "psv", "r", "tsv", "txt")
# Formats whose exported files are expected to be classified as binary.
binformats <- c(
  "dbf", "dta", "rda", "rdata", "rds", "sas7bdat", "sav", "xlsx", "xpt"
)
# Work on a copy of iris whose column names use "_" instead of "."
# (presumably because some export formats reject dots in names -- TODO confirm).
names(iris) <- gsub("\\.", "_", names(iris))

test_that("Required text formats recognized as text", {
  # Export iris once per format in `txtformats` and classify the written file.
  for (fmt in txtformats) {
    exported <- export(iris, paste0("iris.", fmt))
    expect_true(
      is_file_text(exported),
      label = paste0(fmt, " should be text")
    )
  }
})

test_that("Required non-text formats recognized as non-text", {
  # Export iris once per format in `binformats` and classify the written file.
  for (fmt in binformats) {
    exported <- export(iris, paste0("iris.", fmt))
    expect_false(
      is_file_text(exported),
      label = paste0(fmt, " should not be text")
    )
  }
})

test_that("csvy recognized as text", {
  # Skipped when the csvy package is not installed.
  skip_if_not_installed("csvy")
  expect_true(is_file_text(export(iris, "iris.csvy")))
})

test_that("xml and html recognized as text", {
  # Skipped when the xml2 package is not installed.
  skip_if_not_installed("xml2")
  expect_true(is_file_text(export(iris, "iris.xml")))
  expect_true(is_file_text(export(iris, "iris.html")))
})

test_that("json recognized as text", {
  # Skipped when the jsonlite package is not installed.
  skip_if_not_installed("jsonlite")
  expect_true(is_file_text(export(iris, "iris.json")))
})

test_that("yml recognized as text", {
  # Skipped when the yaml package is not installed.
  skip_if_not_installed("yaml")
  expect_true(is_file_text(export(iris, "iris.yml")))
})

test_that("pzfx recognized as text", {
  # Skipped when the pzfx package is not installed.
  skip_if_not_installed("pzfx")
  # The fifth column of iris is dropped before exporting to pzfx.
  expect_true(is_file_text(export(iris[,-5], "iris.pzfx")))
})

test_that("matlab recognized as binary", {
  # Skipped when the rmatio package is not installed.
  skip_if_not_installed("rmatio")
  expect_false(is_file_text(export(iris, "iris.matlab")))
})

test_that("ods recognized as binary", {
  # Skipped when the readODS package is not installed.
  skip_if_not_installed("readODS")
  expect_false(is_file_text(export(iris, "iris.ods")))
})

test_that("fst recognized as binary", {
  # Skipped when the fst package is not installed.
  skip_if_not_installed("fst")
  expect_false(is_file_text(export(iris, "iris.fst")))
})

test_that("feather recognized as binary", {
  # Skipped when the feather package is not installed.
  skip_if_not_installed("feather")
  expect_false(is_file_text(export(iris, "iris.feather")))
})

# Delete every file the tests above may have written, then drop the objects
# this script created.
unlink(sprintf(
  "iris.%s",
  c(
    txtformats, binformats,
    "csvy", "xml", "html", "json", "yml", "pzfx",
    "matlab", "ods", "fst", "feather"
  )
))
rm(iris, txtformats, binformats)

0 comments on commit ffc7c80

Please sign in to comment.