From 5000d39895b752e32be9524efb06e16e1eb52e36 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 22 Nov 2023 15:01:19 +0100 Subject: [PATCH 1/5] add fix --- DESCRIPTION | 2 +- R/fread.R | 7 ++----- man/fread.Rd | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 74a4b6e1c..3e4b7b53b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Version: 1.14.9 Title: Extension of `data.frame` Depends: R (>= 3.1.0) Imports: methods -Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown +Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown SystemRequirements: zlib Description: Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development. License: MPL-2.0 | file LICENSE diff --git a/R/fread.R b/R/fread.R index e0337c591..3cd19f789 100644 --- a/R/fread.R +++ b/R/fread.R @@ -76,11 +76,8 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (w <- startsWithAny(file, c("https://", "ftps://", "http://", "ftp://", "file://"))) { # avoid grepl() for #2531 # nocov start tmpFile = tempfile(fileext = paste0(".",tools::file_ext(file)), tmpdir=tmpdir) # retain .gz extension in temp filename so it knows to be decompressed further below - if (w<=2L) { # https: or ftps: - if (!requireNamespace("curl", quietly = TRUE)) - stopf("URL requires https:// connection for which fread() requires 'curl' package which cannot be found. Please install 'curl' using 'install.packages('curl')'.") # nocov - - curl::curl_download(file, tmpFile, mode="wb", quiet = !showProgress) + if (w<=2L && base::getRversion()<"3.2.2") { + stopf("URL requires download.file functionalities from R >=3.2.2. 
You can still manually download the file and fread the downloaded file.") } else { method = if (w==5L) "internal" # force 'auto' when file: to ensure we don't use an invalid option (e.g. wget), #1668 else getOption("download.file.method", default="auto") # http: or ftp: diff --git a/man/fread.Rd b/man/fread.Rd index cc96062de..de47df055 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -115,7 +115,7 @@ Currently, the \code{yaml} setting is somewhat inflexible with respect to incorp \bold{File Download:} -When \code{input} begins with http://, https://, ftp://, ftps://, or file://, \code{fread} detects this and \emph{downloads} the target to a temporary file (at \code{tempfile()}) before proceeding to read the file as usual. Secure URLS (ftps:// and https://) are downloaded with \code{curl::curl_download}; ftp:// and http:// paths are downloaded with \code{download.file} and \code{method} set to \code{getOption("download.file.method")}, defaulting to \code{"auto"}; and file:// is downloaded with \code{download.file} with \code{method="internal"}. NB: this implies that for file://, even files found on the current machine will be "downloaded" (i.e., hard-copied) to a temporary file. See \code{\link{download.file}} for more details. +When \code{input} begins with http://, https://, ftp://, ftps://, or file://, \code{fread} detects this and \emph{downloads} the target to a temporary file (at \code{tempfile()}) before proceeding to read the file as usual. URLs (ftps:// and https:// as well as ftp:// and http://) are downloaded with \code{download.file} and \code{method} set to \code{getOption("download.file.method")}, defaulting to \code{"auto"}; and file:// is downloaded with \code{download.file} with \code{method="internal"}. NB: this implies that for file://, even files found on the current machine will be "downloaded" (i.e., hard-copied) to a temporary file. See \code{\link{download.file}} for more details. 
\bold{Shell commands:} From 3c0d26ffdacd7210606a147ab70a022bff69ebc7 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 22 Nov 2023 15:07:03 +0100 Subject: [PATCH 2/5] add comment --- R/fread.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fread.R b/R/fread.R index 3cd19f789..31c3abb33 100644 --- a/R/fread.R +++ b/R/fread.R @@ -76,7 +76,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (w <- startsWithAny(file, c("https://", "ftps://", "http://", "ftp://", "file://"))) { # avoid grepl() for #2531 # nocov start tmpFile = tempfile(fileext = paste0(".",tools::file_ext(file)), tmpdir=tmpdir) # retain .gz extension in temp filename so it knows to be decompressed further below - if (w<=2L && base::getRversion()<"3.2.2") { + if (w<=2L && base::getRversion()<"3.2.2") { # https: or ftps: stopf("URL requires download.file functionalities from R >=3.2.2. You can still manually download the file and fread the downloaded file.") } else { method = if (w==5L) "internal" # force 'auto' when file: to ensure we don't use an invalid option (e.g. wget), #1668 From 55c62602295b663d35b43590ebf340ab44dad4f8 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 22 Nov 2023 15:09:10 +0100 Subject: [PATCH 3/5] update --- R/fread.R | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/R/fread.R b/R/fread.R index 31c3abb33..92f419479 100644 --- a/R/fread.R +++ b/R/fread.R @@ -78,12 +78,11 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") tmpFile = tempfile(fileext = paste0(".",tools::file_ext(file)), tmpdir=tmpdir) # retain .gz extension in temp filename so it knows to be decompressed further below if (w<=2L && base::getRversion()<"3.2.2") { # https: or ftps: stopf("URL requires download.file functionalities from R >=3.2.2. 
You can still manually download the file and fread the downloaded file.") - } else { - method = if (w==5L) "internal" # force 'auto' when file: to ensure we don't use an invalid option (e.g. wget), #1668 - else getOption("download.file.method", default="auto") # http: or ftp: - download.file(file, tmpFile, method=method, mode="wb", quiet=!showProgress) - # In text mode on Windows-only, R doubles up \r to make \r\r\n line endings. mode="wb" avoids that. See ?connections:"CRLF" } + method = if (w==5L) "internal" # force 'auto' when file: to ensure we don't use an invalid option (e.g. wget), #1668 + else getOption("download.file.method", default="auto") # http: or ftp: + # In text mode on Windows-only, R doubles up \r to make \r\r\n line endings. mode="wb" avoids that. See ?connections:"CRLF" + download.file(file, tmpFile, method=method, mode="wb", quiet=!showProgress) file = tmpFile on.exit(unlink(tmpFile), add=TRUE) # nocov end From 2896ebc9a3968350070aeddebd0ee87aaf397e00 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 6 Dec 2023 15:28:33 +0100 Subject: [PATCH 4/5] update comments --- R/fread.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fread.R b/R/fread.R index 92f419479..8e9a11b12 100644 --- a/R/fread.R +++ b/R/fread.R @@ -76,7 +76,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (w <- startsWithAny(file, c("https://", "ftps://", "http://", "ftp://", "file://"))) { # avoid grepl() for #2531 # nocov start tmpFile = tempfile(fileext = paste0(".",tools::file_ext(file)), tmpdir=tmpdir) # retain .gz extension in temp filename so it knows to be decompressed further below - if (w<=2L && base::getRversion()<"3.2.2") { # https: or ftps: + if (w<=2L && base::getRversion()<"3.2.2") { # https: or ftps: can be read by default by download.file() since 3.2.2 stopf("URL requires download.file functionalities from R >=3.2.2. 
You can still manually download the file and fread the downloaded file.") } method = if (w==5L) "internal" # force 'auto' when file: to ensure we don't use an invalid option (e.g. wget), #1668 From 78894fd6744d1c6c331c06bec8e25b55b424e5a2 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 6 Dec 2023 15:57:17 +0100 Subject: [PATCH 5/5] add NEWS --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 52333e9b3..0aaa2e436 100644 --- a/NEWS.md +++ b/NEWS.md @@ -561,6 +561,8 @@ identical(DT1, DT2) # TRUE ``` +55. `fread(URL)` with `https:` and `ftps:` could time out if proxy settings were not guessed correctly by `curl::curl_download`, [#1686](https://github.com/Rdatatable/data.table/issues/1686). `fread(URL)` now uses `download.file()` by default to download files from URLs. Thanks to @cderv for the report and Benjamin Schwendinger for the fix. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.<type>()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example :