From 214a07fe9656f9e67cf46906bae87c40ee41d3e5 Mon Sep 17 00:00:00 2001
From: Alex Bokov <bokov@uthscsa.edu>
Date: Tue, 8 Oct 2019 14:04:24 -0500
Subject: [PATCH 1/4] isfiletext() checks whether a file is text or binary in a
 cross-platform manner, completing ticket #236. This can be useful when a file
 extension is missing or ambiguous

---
 NAMESPACE                        |  1 +
 R/isfiletext.R                   | 34 +++++++++++++++
 man/isfiletext.Rd                | 45 ++++++++++++++++++++
 tests/testthat/test_isfiletext.R | 71 ++++++++++++++++++++++++++++++++
 4 files changed, 151 insertions(+)
 create mode 100644 R/isfiletext.R
 create mode 100644 man/isfiletext.Rd
 create mode 100644 tests/testthat/test_isfiletext.R

diff --git a/NAMESPACE b/NAMESPACE
index 87d7ab1..cf1b00c 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -83,6 +83,7 @@ export(get_ext)
 export(import)
 export(import_list)
 export(install_formats)
+export(isfiletext)
 export(spread_attrs)
 importFrom(curl,curl_fetch_memory)
 importFrom(curl,parse_headers)
diff --git a/R/isfiletext.R b/R/isfiletext.R
new file mode 100644
index 0000000..9b3dae2
--- /dev/null
+++ b/R/isfiletext.R
@@ -0,0 +1,34 @@
+#' Determine whether a file is "plain-text" or some sort of binary format
+#'
+#' @param filename Path to the file
+#' @param maxsize  Maximum number of bytes to read
+#' @param textbytes Which characters are used by normal (though not necessarily 
+#'                  just ASCII) text. To detect just ASCII, the following value
+#'                  can be used: `as.raw(c(7:16,18,19,32:127))`
+#' @param tf       If `TRUE` (default) simply return `TRUE` when `filename` 
+#'                 references a text-only file and `FALSE` otherwise. If set to
+#'                 `FALSE` then returns the "non text" bytes found in the file.
+#'
+#' @return boolean or raw
+#' @export
+#' @examples
+#' library(datasets)
+#' export(iris,"iris.yml")
+#' isfiletext("iris.yml")
+#' ## TRUE
+#' 
+#' export(iris,"iris.sav")
+#' isfiletext("iris.sav")
+#' ## FALSE
+#' isfiletext("iris.sav", tf=FALSE)
+#' ## These are the characters found in "iris.sav" that are not printable text
+#' ## 02 00 05 03 06 04 01 14 15 11 17 16 1c 19 1b 1a 18 1e 1d 1f
+isfiletext <- function(filename,maxsize=Inf,
+                       textbytes=as.raw(c(0x7:0x10,0x12,0x13,0x20:0xFF)),
+                       tf=TRUE){
+  bytes <- readBin(ff<-file(filename,'rb'),raw(),n=min(file.info(filename)$size,
+                                                       maxsize));
+  close(ff);
+  nontextbytes <- setdiff(bytes,textbytes);
+  if(tf) return(length(nontextbytes)==0) else return(nontextbytes);
+}
\ No newline at end of file
diff --git a/man/isfiletext.Rd b/man/isfiletext.Rd
new file mode 100644
index 0000000..78aa944
--- /dev/null
+++ b/man/isfiletext.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/isfiletext.R
+\name{isfiletext}
+\alias{isfiletext}
+\title{Determine whether a file is "plain-text" or some sort of binary format}
+\usage{
+isfiletext(
+  filename,
+  maxsize = Inf,
+  textbytes = as.raw(c(7:16, 18, 19, 32:255)),
+  tf = TRUE
+)
+}
+\arguments{
+\item{filename}{Path to the file}
+
+\item{maxsize}{Maximum number of bytes to read}
+
+\item{textbytes}{Which characters are used by normal (though not necessarily 
+just ASCII) text. To detect just ASCII, the following value
+can be used: `as.raw(c(7:16,18,19,32:127))`}
+
+\item{tf}{If `TRUE` (default) simply return `TRUE` when `filename` 
+references a text-only file and `FALSE` otherwise. If set to
+`FALSE` then returns the "non text" bytes found in the file.}
+}
+\value{
+boolean or raw
+}
+\description{
+Determine whether a file is "plain-text" or some sort of binary format
+}
+\examples{
+library(datasets)
+export(iris,"iris.yml")
+isfiletext("iris.yml")
+## TRUE
+
+export(iris,"iris.sav")
+isfiletext("iris.sav")
+## FALSE
+isfiletext("iris.sav", tf=FALSE)
+## These are the characters found in "iris.sav" that are not printable text
+## 02 00 05 03 06 04 01 14 15 11 17 16 1c 19 1b 1a 18 1e 1d 1f
+}
diff --git a/tests/testthat/test_isfiletext.R b/tests/testthat/test_isfiletext.R
new file mode 100644
index 0000000..ccbb00b
--- /dev/null
+++ b/tests/testthat/test_isfiletext.R
@@ -0,0 +1,71 @@
+context("correctly identifying files as text vs binary")
+require("datasets")
+
+txtformats <- c("arff","csv","csv2","dump","fwf","psv","r","tsv","txt","fwf")
+binformats <- c("dbf","dta","rda","rdata","rds","sas7bdat","sav","xlsx","xpt")
+names(iris) <- gsub("\\.","_",names(iris))
+
+test_that("Required text formats recognized as text", {
+  for(xx in txtformats) expect_true(isfiletext(export(iris,
+                                                      paste0("iris.",xx))),
+                                    label = paste0(xx," should be text"))
+  })
+
+test_that("Required non-text formats recognized as non-text", {
+  for(xx in binformats) expect_false(isfiletext(export(iris,
+                                                       paste0("iris.",xx))),
+                                     label = paste0(xx," should be text"))
+  })
+
+test_that("csvy recognized as text", {
+  skip_if_not_installed(pkg="csvy")
+  expect_true(isfiletext(export(iris,'iris.csvy')))
+  })
+
+test_that("xml and html recognized as text", {
+  skip_if_not_installed(pkg="xml2")
+  expect_true(isfiletext(export(iris,'iris.xml')))
+  expect_true(isfiletext(export(iris,'iris.html')))
+  })
+
+test_that("json recognized as text", {
+  skip_if_not_installed(pkg="jsonlite")
+  expect_true(isfiletext(export(iris,'iris.json')))
+  })
+
+test_that("yml recognized as text", {
+  skip_if_not_installed(pkg="yaml")
+  expect_true(isfiletext(export(iris,'iris.yml')))
+  })
+
+test_that("pzfx recognized as text", {
+  skip_if_not_installed(pkg="pzfx")
+  expect_true(isfiletext(export(iris[,-5],"iris.pzfx")))
+  })
+
+# binformats_suggest <- c("matlab","ods","fst","feather")
+test_that("matlab recognized as binary", {
+  skip_if_not_installed(pkg="rmatio")
+  expect_false(isfiletext(export(iris,'iris.matlab')))
+})
+
+test_that("ods recognized as binary", {
+  skip_if_not_installed(pkg="readODS")
+  expect_false(isfiletext(export(iris,'iris.ods')))
+})
+
+test_that("fst recognized as binary", {
+  skip_if_not_installed(pkg="fst")
+  expect_false(isfiletext(export(iris,'iris.fst')))
+})
+
+test_that("feather recognized as binary", {
+  skip_if_not_installed(pkg="feather")
+  expect_false(isfiletext(export(iris,'iris.feather')))
+})
+
+unlink(paste0('iris.',c(txtformats,binformats,'csvy','xml','html','json',
+                        'yml','pzfx','matlab','ods',
+                        'fst','feather')))
+rm(iris,txtformats,binformats)
+

From 569cd4eaae31f429cd5d4dce9f4294c425e5c48e Mon Sep 17 00:00:00 2001
From: Alex Bokov <bokov@uthscsa.edu>
Date: Wed, 9 Oct 2019 16:30:22 -0500
Subject: [PATCH 2/4] Minor tweak-- removed redundant 'fwf' entry from
 'txtformats' in the test for isfiletext()

---
 tests/testthat/test_isfiletext.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/testthat/test_isfiletext.R b/tests/testthat/test_isfiletext.R
index ccbb00b..0ea6629 100644
--- a/tests/testthat/test_isfiletext.R
+++ b/tests/testthat/test_isfiletext.R
@@ -1,7 +1,7 @@
 context("correctly identifying files as text vs binary")
 require("datasets")
 
-txtformats <- c("arff","csv","csv2","dump","fwf","psv","r","tsv","txt","fwf")
+txtformats <- c("arff","csv","csv2","dump","fwf","psv","r","tsv","txt")
 binformats <- c("dbf","dta","rda","rdata","rds","sas7bdat","sav","xlsx","xpt")
 names(iris) <- gsub("\\.","_",names(iris))
 

From a3b5af401081999b0668296f556d6bd94c3c21a7 Mon Sep 17 00:00:00 2001
From: Alex Bokov <bokov@uthscsa.edu>
Date: Fri, 20 Dec 2019 13:04:20 -0600
Subject: [PATCH 3/4] Incorporating feedback from
 #239#pullrequestreview-335144814 and merging in changes to master

---
 NAMESPACE                        |  2 +-
 NEWS.md                          |  1 +
 R/isfiletext.R                   | 57 ++++++++++++----------
 man/is_file_text.Rd              | 38 +++++++++++++++
 man/isfiletext.Rd                | 45 ------------------
 tests/testthat/test_isfiletext.R | 81 ++++++++++++++++----------------
 6 files changed, 113 insertions(+), 111 deletions(-)
 create mode 100644 man/is_file_text.Rd
 delete mode 100644 man/isfiletext.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 50730ba..57cc8b8 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -87,7 +87,7 @@ export(get_ext)
 export(import)
 export(import_list)
 export(install_formats)
-export(isfiletext)
+export(is_file_text)
 export(spread_attrs)
 importFrom(curl,curl_fetch_memory)
 importFrom(curl,parse_headers)
diff --git a/NEWS.md b/NEWS.md
index 0b4bb5b..f0c9448 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,6 +1,7 @@
 # rio 0.5.22
 
 * Added an `export_list()` function to write a list of data frames to multiple files using a vector of file names or a file pattern. (#207, h/t Bill Denney)
+* Added an `is_file_text()` function to determine whether a file is in a plain-text format. Optionally narrower subsets of characters can be specified, e.g. ASCII. (#236 Alex Bokov)
 
 # rio 0.5.21
 
diff --git a/R/isfiletext.R b/R/isfiletext.R
index 9b3dae2..efe6bb1 100644
--- a/R/isfiletext.R
+++ b/R/isfiletext.R
@@ -1,34 +1,41 @@
 #' Determine whether a file is "plain-text" or some sort of binary format
+#' 
+#' For debugging (i.e. why is a file being deemed non-text) set the option
+#' \code{"rio.is_file_text.debug"} to \code{TRUE}. Then the result returned will 
+#' also have an attribute named \code{"non_text_bytes"} which will be a vector 
+#' of the raw bytes which prevented the file from being identified as text
+#' 
+#' @param file       Path to the file
+#' @param maxsize    Maximum number of bytes to read
+#' @param text_bytes Which characters are used by normal text (though not 
+#'                   necessarily just ASCII). To detect just ASCII, the 
+#'                   following value can be used: 
+#'                   \code{as.raw(c(7:16, 18, 19, 32:127))}
 #'
-#' @param filename Path to the file
-#' @param maxsize  Maximum number of bytes to read
-#' @param textbytes Which characters are used by normal (though not necessarily 
-#'                  just ASCII) text. To detect just ASCII, the following value
-#'                  can be used: `as.raw(c(7:16,18,19,32:127))`
-#' @param tf       If `TRUE` (default) simply return `TRUE` when `filename` 
-#'                 references a text-only file and `FALSE` otherwise. If set to
-#'                 `FALSE` then returns the "non text" bytes found in the file.
-#'
-#' @return boolean or raw
+#' @return boolean
 #' @export
 #' @examples
 #' library(datasets)
-#' export(iris,"iris.yml")
-#' isfiletext("iris.yml")
+#' export(iris, "iris.yml")
+#' is_file_text("iris.yml")
 #' ## TRUE
 #' 
-#' export(iris,"iris.sav")
-#' isfiletext("iris.sav")
+#' export(iris, "iris.sav")
+#' is_file_text("iris.sav")
 #' ## FALSE
-#' isfiletext("iris.sav", tf=FALSE)
-#' ## These are the characters found in "iris.sav" that are not printable text
-#' ## 02 00 05 03 06 04 01 14 15 11 17 16 1c 19 1b 1a 18 1e 1d 1f
-isfiletext <- function(filename,maxsize=Inf,
-                       textbytes=as.raw(c(0x7:0x10,0x12,0x13,0x20:0xFF)),
-                       tf=TRUE){
-  bytes <- readBin(ff<-file(filename,'rb'),raw(),n=min(file.info(filename)$size,
-                                                       maxsize));
-  close(ff);
-  nontextbytes <- setdiff(bytes,textbytes);
-  if(tf) return(length(nontextbytes)==0) else return(nontextbytes);
+#' 
+is_file_text <- function(file, maxsize = Inf, 
+                       text_bytes = as.raw(c(0x7:0x10, 0x12, 0x13, 0x20:0xFF))) {
+  
+  bytes <- readBin(ff <- file(file, "rb"), raw(), 
+                   n = min(file.info(file)$size, maxsize))
+  close(ff)
+
+  non_text_bytes <- setdiff(bytes, text_bytes)
+  result <- length(non_text_bytes) == 0
+  if (getOption("rio.is_file_text.debug", FALSE)) {
+    attr(result, "non_text_bytes") <- non_text_bytes
+  }
+  
+  return(result)
 }
\ No newline at end of file
diff --git a/man/is_file_text.Rd b/man/is_file_text.Rd
new file mode 100644
index 0000000..66b4271
--- /dev/null
+++ b/man/is_file_text.Rd
@@ -0,0 +1,38 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/isfiletext.R
+\name{is_file_text}
+\alias{is_file_text}
+\title{Determine whether a file is "plain-text" or some sort of binary format}
+\usage{
+is_file_text(file, maxsize = Inf, text_bytes = as.raw(c(7:16, 18, 19, 32:255)))
+}
+\arguments{
+\item{file}{Path to the file}
+
+\item{maxsize}{Maximum number of bytes to read}
+
+\item{text_bytes}{Which characters are used by normal text (though not 
+necessarily just ASCII). To detect just ASCII, the 
+following value can be used: 
+\code{as.raw(c(7:16, 18, 19, 32:127))}}
+}
+\value{
+boolean
+}
+\description{
+For debugging (i.e. why is a file being deemed non-text) set the option
+\code{"rio.is_file_text.debug"} to \code{TRUE}. Then the result returned will 
+also have an attribute named \code{"non_text_bytes"} which will be a vector 
+of the raw bytes which prevented the file from being identified as text
+}
+\examples{
+library(datasets)
+export(iris, "iris.yml")
+is_file_text("iris.yml")
+## TRUE
+
+export(iris, "iris.sav")
+is_file_text("iris.sav")
+## FALSE
+
+}
diff --git a/man/isfiletext.Rd b/man/isfiletext.Rd
deleted file mode 100644
index 78aa944..0000000
--- a/man/isfiletext.Rd
+++ /dev/null
@@ -1,45 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/isfiletext.R
-\name{isfiletext}
-\alias{isfiletext}
-\title{Determine whether a file is "plain-text" or some sort of binary format}
-\usage{
-isfiletext(
-  filename,
-  maxsize = Inf,
-  textbytes = as.raw(c(7:16, 18, 19, 32:255)),
-  tf = TRUE
-)
-}
-\arguments{
-\item{filename}{Path to the file}
-
-\item{maxsize}{Maximum number of bytes to read}
-
-\item{textbytes}{Which characters are used by normal (though not necessarily 
-just ASCII) text. To detect just ASCII, the following value
-can be used: `as.raw(c(7:16,18,19,32:127))`}
-
-\item{tf}{If `TRUE` (default) simply return `TRUE` when `filename` 
-references a text-only file and `FALSE` otherwise. If set to
-`FALSE` then returns the "non text" bytes found in the file.}
-}
-\value{
-boolean or raw
-}
-\description{
-Determine whether a file is "plain-text" or some sort of binary format
-}
-\examples{
-library(datasets)
-export(iris,"iris.yml")
-isfiletext("iris.yml")
-## TRUE
-
-export(iris,"iris.sav")
-isfiletext("iris.sav")
-## FALSE
-isfiletext("iris.sav", tf=FALSE)
-## These are the characters found in "iris.sav" that are not printable text
-## 02 00 05 03 06 04 01 14 15 11 17 16 1c 19 1b 1a 18 1e 1d 1f
-}
diff --git a/tests/testthat/test_isfiletext.R b/tests/testthat/test_isfiletext.R
index 0ea6629..5a4d4ac 100644
--- a/tests/testthat/test_isfiletext.R
+++ b/tests/testthat/test_isfiletext.R
@@ -1,71 +1,72 @@
 context("correctly identifying files as text vs binary")
 require("datasets")
 
-txtformats <- c("arff","csv","csv2","dump","fwf","psv","r","tsv","txt")
-binformats <- c("dbf","dta","rda","rdata","rds","sas7bdat","sav","xlsx","xpt")
-names(iris) <- gsub("\\.","_",names(iris))
+txtformats <- c("arff", "csv", "csv2", "dump", "fwf", "psv", "r", "tsv", "txt")
+binformats <- c("dbf", "dta", "rda", "rdata", "rds", "sas7bdat", "sav", "xlsx", 
+                "xpt")
+names(iris) <- gsub("\\.", "_", names(iris))
 
 test_that("Required text formats recognized as text", {
-  for(xx in txtformats) expect_true(isfiletext(export(iris,
-                                                      paste0("iris.",xx))),
-                                    label = paste0(xx," should be text"))
-  })
+  for (xx in txtformats) {
+    expect_true(is_file_text(export(iris, paste0("iris.", xx))), 
+                label = paste0(xx, " should be text"))
+  }
+})
 
 test_that("Required non-text formats recognized as non-text", {
-  for(xx in binformats) expect_false(isfiletext(export(iris,
-                                                       paste0("iris.",xx))),
-                                     label = paste0(xx," should be text"))
-  })
+  for (xx in binformats) {
+    expect_false(is_file_text(export(iris, paste0("iris.", xx))), 
+                 label = paste0(xx, " should not be text"))
+  }
+})
 
 test_that("csvy recognized as text", {
-  skip_if_not_installed(pkg="csvy")
-  expect_true(isfiletext(export(iris,'iris.csvy')))
-  })
+  skip_if_not_installed(pkg = "csvy")
+  expect_true(is_file_text(export(iris, "iris.csvy")))
+})
 
 test_that("xml and html recognized as text", {
-  skip_if_not_installed(pkg="xml2")
-  expect_true(isfiletext(export(iris,'iris.xml')))
-  expect_true(isfiletext(export(iris,'iris.html')))
-  })
+  skip_if_not_installed(pkg = "xml2")
+  expect_true(is_file_text(export(iris, "iris.xml")))
+  expect_true(is_file_text(export(iris, "iris.html")))
+})
 
 test_that("json recognized as text", {
-  skip_if_not_installed(pkg="jsonlite")
-  expect_true(isfiletext(export(iris,'iris.json')))
-  })
+  skip_if_not_installed(pkg = "jsonlite")
+  expect_true(is_file_text(export(iris, "iris.json")))
+})
 
 test_that("yml recognized as text", {
-  skip_if_not_installed(pkg="yaml")
-  expect_true(isfiletext(export(iris,'iris.yml')))
-  })
+  skip_if_not_installed(pkg = "yaml")
+  expect_true(is_file_text(export(iris, "iris.yml")))
+})
 
 test_that("pzfx recognized as text", {
-  skip_if_not_installed(pkg="pzfx")
-  expect_true(isfiletext(export(iris[,-5],"iris.pzfx")))
-  })
+  skip_if_not_installed(pkg = "pzfx")
+  expect_true(is_file_text(export(iris[,-5], "iris.pzfx")))
+})
 
-# binformats_suggest <- c("matlab","ods","fst","feather")
 test_that("matlab recognized as binary", {
-  skip_if_not_installed(pkg="rmatio")
-  expect_false(isfiletext(export(iris,'iris.matlab')))
+  skip_if_not_installed(pkg = "rmatio")
+  expect_false(is_file_text(export(iris, "iris.matlab")))
 })
 
 test_that("ods recognized as binary", {
-  skip_if_not_installed(pkg="readODS")
-  expect_false(isfiletext(export(iris,'iris.ods')))
+  skip_if_not_installed(pkg = "readODS")
+  expect_false(is_file_text(export(iris, "iris.ods")))
 })
 
 test_that("fst recognized as binary", {
-  skip_if_not_installed(pkg="fst")
-  expect_false(isfiletext(export(iris,'iris.fst')))
+  skip_if_not_installed(pkg = "fst")
+  expect_false(is_file_text(export(iris, "iris.fst")))
 })
 
 test_that("feather recognized as binary", {
-  skip_if_not_installed(pkg="feather")
-  expect_false(isfiletext(export(iris,'iris.feather')))
+  skip_if_not_installed(pkg = "feather")
+  expect_false(is_file_text(export(iris, "iris.feather")))
 })
 
-unlink(paste0('iris.',c(txtformats,binformats,'csvy','xml','html','json',
-                        'yml','pzfx','matlab','ods',
-                        'fst','feather')))
-rm(iris,txtformats,binformats)
+unlink(paste0("iris.", c(txtformats, binformats, "csvy", "xml", "html", "json", 
+                         "yml", "pzfx", "matlab", "ods", "fst", "feather")))
+rm(iris, txtformats, binformats)
 

From 61e6b52ac4bcce068d23a588ec7390b313f701b5 Mon Sep 17 00:00:00 2001
From: "Alex F. Bokov, Ph.D" <alex.bokov@gmail.com>
Date: Fri, 20 Dec 2019 20:13:29 -0600
Subject: [PATCH 4/4] Incorporating review feedback

---
 R/isfiletext.R | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/R/isfiletext.R b/R/isfiletext.R
index efe6bb1..a845474 100644
--- a/R/isfiletext.R
+++ b/R/isfiletext.R
@@ -1,9 +1,5 @@
-#' Determine whether a file is "plain-text" or some sort of binary format
+#' @title Determine whether a file is "plain-text" or some sort of binary format
 #' 
-#' For debugging (i.e. why is a file being deemed non-text) set the option
-#' \code{"rio.is_file_text.debug"} to \code{TRUE}. Then the result returned will 
-#' also have an attribute named \code{"non_text_bytes"} which will be a vector 
-#' of the raw bytes which prevented the file from being identified as text
 #' 
 #' @param file       Path to the file
 #' @param maxsize    Maximum number of bytes to read
@@ -12,7 +8,7 @@
 #'                   following value can be used: 
 #'                   \code{as.raw(c(7:16, 18, 19, 32:127))}
 #'
-#' @return boolean
+#' @return A logical
 #' @export
 #' @examples
 #' library(datasets)
@@ -31,11 +27,5 @@ is_file_text <- function(file, maxsize = Inf,
                    n = min(file.info(file)$size, maxsize))
   close(ff)
 
-  non_text_bytes <- setdiff(bytes, text_bytes)
-  result <- length(non_text_bytes) == 0
-  if (getOption("rio.is_file_text.debug", FALSE)) {
-    attr(result, "non_text_bytes") <- non_text_bytes
-  }
-  
-  return(result)
-}
\ No newline at end of file
+  return(length(setdiff(bytes, text_bytes)) == 0)
+}