gesistsa · leeper · Dec 21, 2019 · Oct 8, 2019 · Oct 9, 2019 · Nov 7, 2019
diff --git a/NAMESPACE b/NAMESPACE
@@ -87,6 +87,7 @@ export(get_ext)
 export(import)
 export(import_list)
 export(install_formats)
+export(is_file_text)
 export(spread_attrs)
 importFrom(curl,curl_fetch_memory)
 importFrom(curl,parse_headers)

diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,7 @@
 # rio 0.5.22
 
 * Added an `export_list()` function to write a list of data frames to multiple files using a vector of file names or a file pattern. (#207, h/t Bill Denney)
+* Added an `is_file_text()` function to determine whether a file is in a plain-text format. A narrower set of allowed characters (e.g., ASCII only) can optionally be specified. (#236, Alex Bokov)
 
 # rio 0.5.21
 

diff --git a/R/isfiletext.R b/R/isfiletext.R
@@ -0,0 +1,41 @@
+#' Determine whether a file is "plain-text" or some sort of binary format
+#'
+#' Reads up to \code{maxsize} bytes of \code{file} and reports whether every
+#' byte that was read belongs to \code{text_bytes}.
+#'
+#' For debugging (i.e. why is a file being deemed non-text) set the option
+#' \code{"rio.is_file_text.debug"} to \code{TRUE}. Then the result returned will
+#' also have an attribute named \code{"non_text_bytes"} holding the distinct raw
+#' byte values that prevented the file from being identified as text.
+#'
+#' @param file       Path to the file
+#' @param maxsize    Maximum number of bytes to read
+#' @param text_bytes Raw vector of the byte values used by normal text (though
+#'                   not necessarily just ASCII). To detect just ASCII, the
+#'                   following value can be used:
+#'                   \code{as.raw(c(7:16, 18, 19, 32:127))}
+#'
+#' @return \code{TRUE} if every byte read is in \code{text_bytes}, else \code{FALSE}
+#' @export
+#' @examples
+#' library(datasets)
+#' export(iris, "iris.yml")
+#' is_file_text("iris.yml")
+#' ## TRUE
+#'
+#' export(iris, "iris.sav")
+#' is_file_text("iris.sav")
+#' ## FALSE
+is_file_text <- function(file, maxsize = Inf,
+                       text_bytes = as.raw(c(0x7:0x10, 0x12, 0x13, 0x20:0xFF))) {
+  con <- file(file, "rb")
+  # Close the connection on exit, even when readBin() signals an error.
+  on.exit(close(con), add = TRUE)
+  bytes <- readBin(con, raw(), n = min(file.info(file)$size, maxsize))
+  non_text_bytes <- setdiff(bytes, text_bytes)
+  result <- length(non_text_bytes) == 0
+  if (getOption("rio.is_file_text.debug", FALSE)) {
+    attr(result, "non_text_bytes") <- non_text_bytes
+  }
+  result
+}
diff --git a/man/is_file_text.Rd b/man/is_file_text.Rd
diff --git a/tests/testthat/test_isfiletext.R b/tests/testthat/test_isfiletext.R
@@ -0,0 +1,72 @@
+context("correctly identifying files as text vs binary")
+library("datasets")
+# Formats exercised unconditionally, grouped by the expected is_file_text() result.
+txtformats <- c("arff", "csv", "csv2", "dump", "fwf", "psv", "r", "tsv", "txt")
+binformats <- c("dbf", "dta", "rda", "rdata", "rds", "sas7bdat", "sav", "xlsx",
+                "xpt")
+names(iris) <- gsub("\\.", "_", names(iris))  # copy of iris with dot-free names
+
+test_that("Required text formats recognized as text", {
+  for (fmt in txtformats) {
+    expect_true(is_file_text(export(iris, paste0("iris.", fmt))),
+                label = paste0(fmt, " should be text"))
+  }
+})
+
+test_that("Required non-text formats recognized as non-text", {
+  for (fmt in binformats) {
+    expect_false(is_file_text(export(iris, paste0("iris.", fmt))),
+                 label = paste0(fmt, " should not be text"))
+  }
+})
+# Formats below need an optional package; each test is skipped when it is missing.
+test_that("csvy recognized as text", {
+  skip_if_not_installed(pkg = "csvy")
+  expect_true(is_file_text(export(iris, "iris.csvy")))
+})
+
+test_that("xml and html recognized as text", {
+  skip_if_not_installed(pkg = "xml2")
+  expect_true(is_file_text(export(iris, "iris.xml")))
+  expect_true(is_file_text(export(iris, "iris.html")))
+})
+
+test_that("json recognized as text", {
+  skip_if_not_installed(pkg = "jsonlite")
+  expect_true(is_file_text(export(iris, "iris.json")))
+})
+
+test_that("yml recognized as text", {
+  skip_if_not_installed(pkg = "yaml")
+  expect_true(is_file_text(export(iris, "iris.yml")))
+})
+
+test_that("pzfx recognized as text", {
+  skip_if_not_installed(pkg = "pzfx")
+  expect_true(is_file_text(export(iris[,-5], "iris.pzfx")))
+})
+
+test_that("matlab recognized as binary", {
+  skip_if_not_installed(pkg = "rmatio")
+  expect_false(is_file_text(export(iris, "iris.matlab")))
+})
+
+test_that("ods recognized as binary", {
+  skip_if_not_installed(pkg = "readODS")
+  expect_false(is_file_text(export(iris, "iris.ods")))
+})
+
+test_that("fst recognized as binary", {
+  skip_if_not_installed(pkg = "fst")
+  expect_false(is_file_text(export(iris, "iris.fst")))
+})
+
+test_that("feather recognized as binary", {
+  skip_if_not_installed(pkg = "feather")
+  expect_false(is_file_text(export(iris, "iris.feather")))
+})
+# Remove the files written by the tests and the objects defined above.
+unlink(paste0("iris.", c(txtformats, binformats, "csvy", "xml", "html", "json", 
+                         "yml", "pzfx", "matlab", "ods", "fst", "feather")))
+rm(iris, txtformats, binformats)
+