From 214a07fe9656f9e67cf46906bae87c40ee41d3e5 Mon Sep 17 00:00:00 2001 From: Alex Bokov Date: Tue, 8 Oct 2019 14:04:24 -0500 Subject: [PATCH 1/4] isfiletext() checks whether a file is text or binary in a cross platform matter, completing ticket #236. This can be useful when a file extension is missing or ambiguous --- NAMESPACE | 1 + R/isfiletext.R | 34 +++++++++++++++ man/isfiletext.Rd | 45 ++++++++++++++++++++ tests/testthat/test_isfiletext.R | 71 ++++++++++++++++++++++++++++++++ 4 files changed, 151 insertions(+) create mode 100644 R/isfiletext.R create mode 100644 man/isfiletext.Rd create mode 100644 tests/testthat/test_isfiletext.R diff --git a/NAMESPACE b/NAMESPACE index 87d7ab1..cf1b00c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -83,6 +83,7 @@ export(get_ext) export(import) export(import_list) export(install_formats) +export(isfiletext) export(spread_attrs) importFrom(curl,curl_fetch_memory) importFrom(curl,parse_headers) diff --git a/R/isfiletext.R b/R/isfiletext.R new file mode 100644 index 0000000..9b3dae2 --- /dev/null +++ b/R/isfiletext.R @@ -0,0 +1,34 @@ +#' Determine whether a file is "plain-text" or some sort of binary format +#' +#' @param filename Path to the file +#' @param maxsize Maximum number of bytes to read +#' @param textbytes Which characters are used by normal (though not necessarily +#' just ASCII) text. To detect just ASCII, the following value +#' can be used: `as.raw(c(7:16,18,19,32:127))` +#' @param tf If `TRUE` (default) simply return `TRUE` when `filename` +#' references a text-only file and `FALSE` otherwise. If set to +#' `FALSE` then returns the "non text" bytes found in the file. +#' +#' @return boolean or raw +#' @export +#' @examples +#' library(datasets) +#' export(iris,"iris.yml") +#' isfiletext("iris.yml") +#' ## TRUE +#' +#' export(iris,"iris.sav") +#' isfiletext("iris.sav") +#' ## FALSE +#' isfiletext("iris.sav", tf=FALSE) +#' ## These are the characters found in "iris.sav" that are not printable text +#' ## 02 00 05 03 06 04 01 14 15 11 17 16 1c 19 1b 1a 18 1e 1d 1f +isfiletext <- function(filename,maxsize=Inf, + textbytes=as.raw(c(0x7:0x10,0x12,0x13,0x20:0xFF)), + tf=TRUE){ + bytes <- readBin(ff<-file(filename,'rb'),raw(),n=min(file.info(filename)$size, + maxsize)); + close(ff); + nontextbytes <- setdiff(bytes,textbytes); + if(tf) return(length(nontextbytes)==0) else return(nontextbytes); +} \ No newline at end of file diff --git a/man/isfiletext.Rd b/man/isfiletext.Rd new file mode 100644 index 0000000..78aa944 --- /dev/null +++ b/man/isfiletext.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/isfiletext.R +\name{isfiletext} +\alias{isfiletext} +\title{Determine whether a file is "plain-text" or some sort of binary format} +\usage{ +isfiletext( + filename, + maxsize = Inf, + textbytes = as.raw(c(7:16, 18, 19, 32:255)), + tf = TRUE +) +} +\arguments{ +\item{filename}{Path to the file} + +\item{maxsize}{Maximum number of bytes to read} + +\item{textbytes}{Which characters are used by normal (though not necessarily +just ASCII) text. To detect just ASCII, the following value +can be used: `as.raw(c(7:16,18,19,32:127))`} + +\item{tf}{If `TRUE` (default) simply return `TRUE` when `filename` +references a text-only file and `FALSE` otherwise. If set to +`FALSE` then returns the "non text" bytes found in the file.} +} +\value{ +boolean or raw +} +\description{ +Determine whether a file is "plain-text" or some sort of binary format +} +\examples{ +library(datasets) +export(iris,"iris.yml") +isfiletext("iris.yml") +## TRUE + +export(iris,"iris.sav") +isfiletext("iris.sav") +## FALSE +isfiletext("iris.sav", tf=FALSE) +## These are the characters found in "iris.sav" that are not printable text +## 02 00 05 03 06 04 01 14 15 11 17 16 1c 19 1b 1a 18 1e 1d 1f +} diff --git a/tests/testthat/test_isfiletext.R b/tests/testthat/test_isfiletext.R new file mode 100644 index 0000000..ccbb00b --- /dev/null +++ b/tests/testthat/test_isfiletext.R @@ -0,0 +1,71 @@ +context("correctly identifying files as text vs binary") +require("datasets") + +txtformats <- c("arff","csv","csv2","dump","fwf","psv","r","tsv","txt","fwf") +binformats <- c("dbf","dta","rda","rdata","rds","sas7bdat","sav","xlsx","xpt") +names(iris) <- gsub("\\.","_",names(iris)) + +test_that("Required text formats recognized as text", { + for(xx in txtformats) expect_true(isfiletext(export(iris, + paste0("iris.",xx))), + label = paste0(xx," should be text")) + }) + +test_that("Required non-text formats recognized as non-text", { + for(xx in binformats) expect_false(isfiletext(export(iris, + paste0("iris.",xx))), + label = paste0(xx," should be text")) + }) + +test_that("csvy recognized as text", { + skip_if_not_installed(pkg="csvy") + expect_true(isfiletext(export(iris,'iris.csvy'))) + }) + +test_that("xml and html recognized as text", { + skip_if_not_installed(pkg="xml2") + expect_true(isfiletext(export(iris,'iris.xml'))) + expect_true(isfiletext(export(iris,'iris.html'))) + }) + +test_that("json recognized as text", { + skip_if_not_installed(pkg="jsonlite") + expect_true(isfiletext(export(iris,'iris.json'))) + }) + +test_that("yml recognized as text", { + skip_if_not_installed(pkg="yaml") + expect_true(isfiletext(export(iris,'iris.yml'))) + }) + +test_that("pzfx recognized as text", { + skip_if_not_installed(pkg="pzfx") + expect_true(isfiletext(export(iris[,-5],"iris.pzfx"))) + }) + +# binformats_suggest <- c("matlab","ods","fst","feather") +test_that("matlab recognized as binary", { + skip_if_not_installed(pkg="rmatio") + expect_false(isfiletext(export(iris,'iris.matlab'))) +}) + +test_that("ods recognized as binary", { + skip_if_not_installed(pkg="readODS") + expect_false(isfiletext(export(iris,'iris.ods'))) +}) + +test_that("fst recognized as binary", { + skip_if_not_installed(pkg="fst") + expect_false(isfiletext(export(iris,'iris.fst'))) +}) + +test_that("feather recognized as binary", { + skip_if_not_installed(pkg="feather") + expect_false(isfiletext(export(iris,'iris.feather'))) +}) + +unlink(paste0('iris.',c(txtformats,binformats,'csvy','xml','html','json', + 'yml','pzfx','matlab','ods', + 'fst','feather'))) +rm(iris,txtformats,binformats) + From 569cd4eaae31f429cd5d4dce9f4294c425e5c48e Mon Sep 17 00:00:00 2001 From: Alex Bokov Date: Wed, 9 Oct 2019 16:30:22 -0500 Subject: [PATCH 2/4] Minor tweak-- removed redundant 'fwf' entry from 'txtformats' in the test for isfiletext() --- tests/testthat/test_isfiletext.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test_isfiletext.R b/tests/testthat/test_isfiletext.R index ccbb00b..0ea6629 100644 --- a/tests/testthat/test_isfiletext.R +++ b/tests/testthat/test_isfiletext.R @@ -1,7 +1,7 @@ context("correctly identifying files as text vs binary") require("datasets") -txtformats <- c("arff","csv","csv2","dump","fwf","psv","r","tsv","txt","fwf") +txtformats <- c("arff","csv","csv2","dump","fwf","psv","r","tsv","txt") binformats <- c("dbf","dta","rda","rdata","rds","sas7bdat","sav","xlsx","xpt") names(iris) <- gsub("\\.","_",names(iris)) From a3b5af401081999b0668296f556d6bd94c3c21a7 Mon Sep 17 00:00:00 2001 From: Alex Bokov Date: Fri, 20 Dec 2019 13:04:20 -0600 Subject: [PATCH 3/4] Incorporating feedback from #239#pullrequestreview-335144814 and merging in changes to master --- NAMESPACE | 2 +- NEWS.md | 1 + R/isfiletext.R | 57 ++++++++++++---------- man/is_file_text.Rd | 38 +++++++++++++++ man/isfiletext.Rd | 45 ------------------ tests/testthat/test_isfiletext.R | 81 ++++++++++++++++---------------- 6 files changed, 113 insertions(+), 111 deletions(-) create mode 100644 man/is_file_text.Rd delete mode 100644 man/isfiletext.Rd diff --git a/NAMESPACE b/NAMESPACE index 50730ba..57cc8b8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -87,7 +87,7 @@ export(get_ext) export(import) export(import_list) export(install_formats) -export(isfiletext) +export(is_file_text) export(spread_attrs) importFrom(curl,curl_fetch_memory) importFrom(curl,parse_headers) diff --git a/NEWS.md b/NEWS.md index 0b4bb5b..f0c9448 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,7 @@ # rio 0.5.22 * Added an `export_list()` function to write a list of data frames to multiple files using a vector of file names or a file pattern. (#207, h/t Bill Denney) +* Added an `is_file_text()` function to determine whether a file is in a plain-text format. Optionally narrower subsets of characters can be specified, e.g. ASCII. (#236 Alex Bokov) # rio 0.5.21 diff --git a/R/isfiletext.R b/R/isfiletext.R index 9b3dae2..efe6bb1 100644 --- a/R/isfiletext.R +++ b/R/isfiletext.R @@ -1,34 +1,41 @@ #' Determine whether a file is "plain-text" or some sort of binary format +#' +#' For debugging (i.e. why is a file being deemed non-text) set the option +#' \code{"rio.is_file_text.debug"} to \code{TRUE}. Then the result returned will +#' also have an attribute named \code{"non_text_bytes"} which will be a vector +#' of the raw bytes which prevented the file from being identified as text +#' +#' @param file Path to the file +#' @param maxsize Maximum number of bytes to read +#' @param text_bytes Which characters are used by normal text (though not +#' necessarily just ASCII). To detect just ASCII, the +#' following value can be used: +#' \code{as.raw(c(7:16, 18, 19, 32:127))} #' -#' @param filename Path to the file -#' @param maxsize Maximum number of bytes to read -#' @param textbytes Which characters are used by normal (though not necessarily -#' just ASCII) text. To detect just ASCII, the following value -#' can be used: `as.raw(c(7:16,18,19,32:127))` -#' @param tf If `TRUE` (default) simply return `TRUE` when `filename` -#' references a text-only file and `FALSE` otherwise. If set to -#' `FALSE` then returns the "non text" bytes found in the file. -#' -#' @return boolean or raw +#' @return boolean #' @export #' @examples #' library(datasets) -#' export(iris,"iris.yml") -#' isfiletext("iris.yml") +#' export(iris, "iris.yml") +#' is_file_text("iris.yml") #' ## TRUE #' -#' export(iris,"iris.sav") -#' isfiletext("iris.sav") +#' export(iris, "iris.sav") +#' is_file_text("iris.sav") #' ## FALSE -#' isfiletext("iris.sav", tf=FALSE) -#' ## These are the characters found in "iris.sav" that are not printable text -#' ## 02 00 05 03 06 04 01 14 15 11 17 16 1c 19 1b 1a 18 1e 1d 1f -isfiletext <- function(filename,maxsize=Inf, - textbytes=as.raw(c(0x7:0x10,0x12,0x13,0x20:0xFF)), - tf=TRUE){ - bytes <- readBin(ff<-file(filename,'rb'),raw(),n=min(file.info(filename)$size, - maxsize)); - close(ff); - nontextbytes <- setdiff(bytes,textbytes); - if(tf) return(length(nontextbytes)==0) else return(nontextbytes); +#' +is_file_text <- function(file, maxsize = Inf, + text_bytes = as.raw(c(0x7:0x10, 0x12, 0x13, 0x20:0xFF))) { + + bytes <- readBin(ff <- file(file, "rb"), raw(), + n = min(file.info(file)$size, maxsize)) + close(ff) + + non_text_bytes <- setdiff(bytes, text_bytes) + result <- length(non_text_bytes) == 0 + if (getOption("rio.is_file_text.debug", FALSE)) { + attr(result, "non_text_bytes") <- non_text_bytes + } + + return(result) } \ No newline at end of file diff --git a/man/is_file_text.Rd b/man/is_file_text.Rd new file mode 100644 index 0000000..66b4271 --- /dev/null +++ b/man/is_file_text.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/isfiletext.R +\name{is_file_text} +\alias{is_file_text} +\title{Determine whether a file is "plain-text" or some sort of binary format} +\usage{ +is_file_text(file, maxsize = Inf, text_bytes = as.raw(c(7:16, 18, 19, 32:255))) +} +\arguments{ +\item{file}{Path to the file} + +\item{maxsize}{Maximum number of bytes to read} + +\item{text_bytes}{Which characters are used by normal text (though not +necessarily just ASCII). To detect just ASCII, the +following value can be used: +\code{as.raw(c(7:16, 18, 19, 32:127))}} +} +\value{ +boolean +} +\description{ +For debugging (i.e. why is a file being deemed non-text) set the option +\code{"rio.is_file_text.debug"} to \code{TRUE}. Then the result returned will +also have an attribute named \code{"non_text_bytes"} which will be a vector +of the raw bytes which prevented the file from being identified as text +} +\examples{ +library(datasets) +export(iris, "iris.yml") +is_file_text("iris.yml") +## TRUE + +export(iris, "iris.sav") +is_file_text("iris.sav") +## FALSE + +} diff --git a/man/isfiletext.Rd b/man/isfiletext.Rd deleted file mode 100644 index 78aa944..0000000 --- a/man/isfiletext.Rd +++ /dev/null @@ -1,45 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/isfiletext.R -\name{isfiletext} -\alias{isfiletext} -\title{Determine whether a file is "plain-text" or some sort of binary format} -\usage{ -isfiletext( - filename, - maxsize = Inf, - textbytes = as.raw(c(7:16, 18, 19, 32:255)), - tf = TRUE -) -} -\arguments{ -\item{filename}{Path to the file} - -\item{maxsize}{Maximum number of bytes to read} - -\item{textbytes}{Which characters are used by normal (though not necessarily -just ASCII) text. To detect just ASCII, the following value -can be used: `as.raw(c(7:16,18,19,32:127))`} - -\item{tf}{If `TRUE` (default) simply return `TRUE` when `filename` -references a text-only file and `FALSE` otherwise. If set to -`FALSE` then returns the "non text" bytes found in the file.} -} -\value{ -boolean or raw -} -\description{ -Determine whether a file is "plain-text" or some sort of binary format -} -\examples{ -library(datasets) -export(iris,"iris.yml") -isfiletext("iris.yml") -## TRUE - -export(iris,"iris.sav") -isfiletext("iris.sav") -## FALSE -isfiletext("iris.sav", tf=FALSE) -## These are the characters found in "iris.sav" that are not printable text -## 02 00 05 03 06 04 01 14 15 11 17 16 1c 19 1b 1a 18 1e 1d 1f -} diff --git a/tests/testthat/test_isfiletext.R b/tests/testthat/test_isfiletext.R index 0ea6629..5a4d4ac 100644 --- a/tests/testthat/test_isfiletext.R +++ b/tests/testthat/test_isfiletext.R @@ -1,71 +1,72 @@ context("correctly identifying files as text vs binary") require("datasets") -txtformats <- c("arff","csv","csv2","dump","fwf","psv","r","tsv","txt") -binformats <- c("dbf","dta","rda","rdata","rds","sas7bdat","sav","xlsx","xpt") -names(iris) <- gsub("\\.","_",names(iris)) +txtformats <- c("arff", "csv", "csv2", "dump", "fwf", "psv", "r", "tsv", "txt") +binformats <- c("dbf", "dta", "rda", "rdata", "rds", "sas7bdat", "sav", "xlsx", + "xpt") +names(iris) <- gsub("\\.", "_", names(iris)) test_that("Required text formats recognized as text", { - for(xx in txtformats) expect_true(isfiletext(export(iris, - paste0("iris.",xx))), - label = paste0(xx," should be text")) - }) + for (xx in txtformats) { + expect_true(is_file_text(export(iris, paste0("iris.", xx))), + label = paste0(xx, " should be text")) + } +}) test_that("Required non-text formats recognized as non-text", { - for(xx in binformats) expect_false(isfiletext(export(iris, - paste0("iris.",xx))), - label = paste0(xx," should be text")) - }) + for (xx in binformats) { + expect_false(is_file_text(export(iris, paste0("iris.", xx))), + label = paste0(xx, " should not be text")) + } +}) test_that("csvy recognized as text", { - skip_if_not_installed(pkg="csvy") - expect_true(isfiletext(export(iris,'iris.csvy'))) - }) + skip_if_not_installed(pkg = "csvy") + expect_true(is_file_text(export(iris, "iris.csvy"))) +}) test_that("xml and html recognized as text", { - skip_if_not_installed(pkg="xml2") - expect_true(isfiletext(export(iris,'iris.xml'))) - expect_true(isfiletext(export(iris,'iris.html'))) - }) + skip_if_not_installed(pkg = "xml2") + expect_true(is_file_text(export(iris, "iris.xml"))) + expect_true(is_file_text(export(iris, "iris.html"))) +}) test_that("json recognized as text", { - skip_if_not_installed(pkg="jsonlite") - expect_true(isfiletext(export(iris,'iris.json'))) - }) + skip_if_not_installed(pkg = "jsonlite") + expect_true(is_file_text(export(iris, "iris.json"))) +}) test_that("yml recognized as text", { - skip_if_not_installed(pkg="yaml") - expect_true(isfiletext(export(iris,'iris.yml'))) - }) + skip_if_not_installed(pkg = "yaml") + expect_true(is_file_text(export(iris, "iris.yml"))) +}) test_that("pzfx recognized as text", { - skip_if_not_installed(pkg="pzfx") - expect_true(isfiletext(export(iris[,-5],"iris.pzfx"))) - }) + skip_if_not_installed(pkg = "pzfx") + expect_true(is_file_text(export(iris[,-5], "iris.pzfx"))) +}) -# binformats_suggest <- c("matlab","ods","fst","feather") test_that("matlab recognized as binary", { - skip_if_not_installed(pkg="rmatio") - expect_false(isfiletext(export(iris,'iris.matlab'))) + skip_if_not_installed(pkg = "rmatio") + expect_false(is_file_text(export(iris, "iris.matlab"))) }) test_that("ods recognized as binary", { - skip_if_not_installed(pkg="readODS") - expect_false(isfiletext(export(iris,'iris.ods'))) + skip_if_not_installed(pkg = "readODS") + expect_false(is_file_text(export(iris, "iris.ods"))) }) test_that("fst recognized as binary", { - skip_if_not_installed(pkg="fst") - expect_false(isfiletext(export(iris,'iris.fst'))) + skip_if_not_installed(pkg = "fst") + expect_false(is_file_text(export(iris, "iris.fst"))) }) test_that("feather recognized as binary", { - skip_if_not_installed(pkg="feather") - expect_false(isfiletext(export(iris,'iris.feather'))) + skip_if_not_installed(pkg = "feather") + expect_false(is_file_text(export(iris, "iris.feather"))) }) -unlink(paste0('iris.',c(txtformats,binformats,'csvy','xml','html','json', - 'yml','pzfx','matlab','ods', - 'fst','feather'))) -rm(iris,txtformats,binformats) +unlink(paste0("iris.", c(txtformats, binformats, "csvy", "xml", "html", "json", + "yml", "pzfx", "matlab", "ods", "fst", "feather"))) +rm(iris, txtformats, binformats) From 61e6b52ac4bcce068d23a588ec7390b313f701b5 Mon Sep 17 00:00:00 2001 From: "Alex F. Bokov, Ph.D" Date: Fri, 20 Dec 2019 20:13:29 -0600 Subject: [PATCH 4/4] Incorporating review feedback --- R/isfiletext.R | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/R/isfiletext.R b/R/isfiletext.R index efe6bb1..a845474 100644 --- a/R/isfiletext.R +++ b/R/isfiletext.R @@ -1,9 +1,5 @@ -#' Determine whether a file is "plain-text" or some sort of binary format +#' @title Determine whether a file is "plain-text" or some sort of binary format #' -#' For debugging (i.e. why is a file being deemed non-text) set the option -#' \code{"rio.is_file_text.debug"} to \code{TRUE}. Then the result returned will -#' also have an attribute named \code{"non_text_bytes"} which will be a vector -#' of the raw bytes which prevented the file from being identified as text #' #' @param file Path to the file #' @param maxsize Maximum number of bytes to read @@ -12,7 +8,7 @@ #' following value can be used: #' \code{as.raw(c(7:16, 18, 19, 32:127))} #' -#' @return boolean +#' @return A logical #' @export #' @examples #' library(datasets) @@ -31,11 +27,5 @@ is_file_text <- function(file, maxsize = Inf, n = min(file.info(file)$size, maxsize)) close(ff) - non_text_bytes <- setdiff(bytes, text_bytes) - result <- length(non_text_bytes) == 0 - if (getOption("rio.is_file_text.debug", FALSE)) { - attr(result, "non_text_bytes") <- non_text_bytes - } - - return(result) -} \ No newline at end of file + return(length(setdiff(bytes, text_bytes)) == 0) +}