#' @title Determine whether a file is "plain-text" or some sort of binary format
#'
#' @description Reads the raw bytes of \code{file} (at most \code{maxsize} of
#'   them) and reports whether every byte that was read is one of
#'   \code{text_bytes}.
#'
#' @param file Path to the file
#' @param maxsize Maximum number of bytes to read. The number of bytes
#'   actually requested is the smaller of the file size and \code{maxsize}.
#' @param text_bytes Which characters are used by normal text (though not
#'   necessarily just ASCII). To detect just ASCII, the
#'   following value can be used:
#'   \code{as.raw(c(7:16, 18, 19, 32:127))}
#'
#' @return A logical: \code{TRUE} when none of the bytes read fall outside
#'   \code{text_bytes} (so an empty file gives \code{TRUE}), \code{FALSE}
#'   otherwise.
#' @export
#' @examples
#' library(datasets)
#' yml_file <- tempfile(fileext = ".yml")
#' export(iris, yml_file)
#' is_file_text(yml_file)
#' ## TRUE
#'
#' sav_file <- tempfile(fileext = ".sav")
#' export(iris, sav_file)
#' is_file_text(sav_file)
#' ## FALSE
#'
#' unlink(c(yml_file, sav_file))
#'
is_file_text <- function(
    file,
    maxsize = Inf,
    text_bytes = as.raw(c(0x7:0x10, 0x12, 0x13, 0x20:0xFF))) {
  # The default above is written in hex: 0x7:0x10 is 7:16 and 0x12, 0x13 are
  # 18 and 19, which matches the decimal form shown in the documentation.
  # NOTE(review): if the decimal set 7:10, 12, 13 was what was meant, the hex
  # literals would be a typo -- confirm before changing the default.

  con <- file(file, "rb")
  # Register the close immediately so the connection is released even when
  # readBin() fails (e.g. for an invalid `maxsize`).
  on.exit(close(con), add = TRUE)

  bytes <- readBin(con, raw(), n = min(file.info(file)$size, maxsize))

  # Text means: no byte outside the allowed set.
  all(bytes %in% text_bytes)
}
context("correctly identifying files as text vs binary")
library("datasets")

txtformats <- c("arff", "csv", "csv2", "dump", "fwf", "psv", "r", "tsv", "txt")
binformats <- c("dbf", "dta", "rda", "rdata", "rds", "sas7bdat", "sav", "xlsx",
                "xpt")

# Work on a separately named copy of iris whose column names have the dots
# replaced by underscores, instead of overwriting `iris` itself.
iris_data <- iris
names(iris_data) <- gsub("\\.", "_", names(iris_data))

# Every fixture is written below a scratch directory in the session's temp
# dir, so nothing lands in the working directory and cleanup is one unlink().
export_dir <- tempfile("isfiletext_")
dir.create(export_dir)

# Export `data` as iris.<ext> inside the scratch directory and pass on
# whatever export() returns (the tests feed it straight to is_file_text()).
export_iris <- function(ext, data = iris_data) {
  export(data, file.path(export_dir, paste0("iris.", ext)))
}

test_that("Required text formats recognized as text", {
  for (xx in txtformats) {
    expect_true(is_file_text(export_iris(xx)),
                label = paste0(xx, " should be text"))
  }
})

test_that("Required non-text formats recognized as non-text", {
  for (xx in binformats) {
    expect_false(is_file_text(export_iris(xx)),
                 label = paste0(xx, " should not be text"))
  }
})

test_that("csvy recognized as text", {
  skip_if_not_installed(pkg = "csvy")
  expect_true(is_file_text(export_iris("csvy")))
})

test_that("xml and html recognized as text", {
  skip_if_not_installed(pkg = "xml2")
  expect_true(is_file_text(export_iris("xml")))
  expect_true(is_file_text(export_iris("html")))
})

test_that("json recognized as text", {
  skip_if_not_installed(pkg = "jsonlite")
  expect_true(is_file_text(export_iris("json")))
})

test_that("yml recognized as text", {
  skip_if_not_installed(pkg = "yaml")
  expect_true(is_file_text(export_iris("yml")))
})

test_that("pzfx recognized as text", {
  skip_if_not_installed(pkg = "pzfx")
  # The fifth column is left out of the pzfx export, as in the original test.
  expect_true(is_file_text(export_iris("pzfx", data = iris_data[, -5])))
})

test_that("matlab recognized as binary", {
  skip_if_not_installed(pkg = "rmatio")
  expect_false(is_file_text(export_iris("matlab")))
})

test_that("ods recognized as binary", {
  skip_if_not_installed(pkg = "readODS")
  expect_false(is_file_text(export_iris("ods")))
})

test_that("fst recognized as binary", {
  skip_if_not_installed(pkg = "fst")
  expect_false(is_file_text(export_iris("fst")))
})

test_that("feather recognized as binary", {
  skip_if_not_installed(pkg = "feather")
  expect_false(is_file_text(export_iris("feather")))
})

# Remove every exported fixture at once, then drop the script-level helpers.
unlink(export_dir, recursive = TRUE)
rm(iris_data, txtformats, binformats, export_dir, export_iris)