From f8b24f8cdd4a3deea4df9f18a509476754e8733b Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Sun, 19 May 2024 17:14:08 +0200 Subject: [PATCH] fread read compressed .bgz (#5474) * fread read compressed .bgz * merge master * Update NEWS.md Co-authored-by: Michael Chirico --------- Co-authored-by: Michael Chirico --- NEWS.md | 2 ++ R/fread.R | 4 ++-- inst/tests/ch11b.dat.bgz | Bin 0 -> 627 bytes inst/tests/tests.Rraw | 3 +++ 4 files changed, 7 insertions(+), 2 deletions(-) create mode 100644 inst/tests/ch11b.dat.bgz diff --git a/NEWS.md b/NEWS.md index 9163d9eb3..e5dc3e4c0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -42,6 +42,8 @@ 13. `dcast`gains `value.var.in.dots`, `value.var.in.LHSdots` and `value.var.in.RHSdots` arguments, [#5824](https://github.com/Rdatatable/data.table/issues/5824). This allows the `value.var` variable(s) in `dcast` to be represented by `...` in the formula (if not otherwise mentioned). Thanks to @iago-pssjd for the report and PR. +14. `fread` loads `.bgz` files directly, [#5461](https://github.com/Rdatatable/data.table/issues/5461). Thanks to @TMRHarrison for the request with proposed fix, and Benjamin Schwendinger for the PR. + ## BUG FIXES 1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix. 
diff --git a/R/fread.R b/R/fread.R index fc22e9c54..1ce637eaf 100644 --- a/R/fread.R +++ b/R/fread.R @@ -116,10 +116,10 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") gz_signature = as.raw(c(0x1F, 0x8B)) bz2_signature = as.raw(c(0x42, 0x5A, 0x68)) gzsig = FALSE - if ((w <- endsWithAny(file, c(".gz",".bz2"))) || (gzsig <- identical(head(file_signature, 2L), gz_signature)) || identical(head(file_signature, 3L), bz2_signature)) { + if ((w <- endsWithAny(file, c(".gz", ".bgz",".bz2"))) || (gzsig <- identical(head(file_signature, 2L), gz_signature)) || identical(head(file_signature, 3L), bz2_signature)) { if (!requireNamespace("R.utils", quietly = TRUE)) stopf("To read gz and bz2 files directly, fread() requires 'R.utils' package which cannot be found. Please install 'R.utils' using 'install.packages('R.utils')'.") # nocov - FUN = if (w==1L || gzsig) gzfile else bzfile + FUN = if (w<=2L || gzsig) gzfile else bzfile R.utils::decompressFile(file, decompFile<-tempfile(tmpdir=tmpdir), ext=NULL, FUN=FUN, remove=FALSE) # ext is not used by decompressFile when destname is supplied, but isn't optional file = decompFile # don't use 'tmpFile' symbol again, as tmpFile might be the http://domain.org/file.csv.gz download on.exit(unlink(decompFile), add=TRUE) diff --git a/inst/tests/ch11b.dat.bgz b/inst/tests/ch11b.dat.bgz new file mode 100644 index 0000000000000000000000000000000000000000..5ef72ad01536454024db1822ec310eed5ed1a9c3 GIT binary patch literal 627 zcmV-(0*w71iwFb&00000{{{d;LjnL+0#%ejvfv;H1^53HIl(Uh6}|t(CgF9>FxxKT zv%xTiJu&hbH^vkDcb#YajN!mpFr#1qSHUYd3)&bj%wJmt7tjj2DmTytXTbzUhX<(i zI|^PP=@s;y|3MoC&n;*`>y-{v`lvH4fYE{sBO_gBVq~jBj9l(-CqIah%YBrQVdUN+ zMlN?N=otA_WfsK9Nv|0>M$yw9qrhFCV-(&YMnSq4#3+<*H8{s8R=+1kv(McznsT=a z3XG<=pulMM^uTD&{y9cd>0a)E(Hb3Mw3I&Uo`KO)`l>DkMoZ~il^CtrD=}J1e>-nK z^(;TQzx4C_{wyEFXq(`8?0G3l+l#)FD2mgLqVE^q0XXgXxAcWL?WIa9h*SQTk5gK& z#3>a&1aV5)-+~LLl>OZWaZ07X3*dt&9i_ih`XEM!^ji?4qjan0EsPHCs~|>4?xWln 
zMrZXaVRSY5qe_gfCV%Rl8>8#3a$|Jmo`M)%xn~7!jPB}CM$Wd|`g{SEzLma#bY7n) zFj|-P0m?n9A^QT=c`ukUa>KpOOam(Ys?vepVF05UB}OLqT~9GFGP$3+XJX|4rDMdl z%X&+R5!)_n?uilG?$o0_7}5=YB8>YtQ>&6yAQWgHiNXcQ9hxU0n$y zww*?NKBITZF!$7%7s$S;%ijT%`>ZQn14gfOAp1rzfJ$F=&pQQ~dkQX~wbtDYr1SL@ zZvvTn{je7!o2_*hBU}G|!wUVNvu*ORpFc+x%}X{2001A02m}BC000301^_}s0stET N0{{R300000008`@Amji5 literal 0 HcmV?d00001 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 90ba1f73d..0df904956 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17903,6 +17903,9 @@ if (test_R.utils) { test(2229.4, fread(f, logical01=FALSE)[,1], data.table(V1 = 1:100)) file.copy(testDir("issue_785_fread.txt.gz"), f, overwrite=TRUE) test(2229.5, fread(f, logical01=FALSE)[,25], data.table(Sv3 = c(10,14,14,15))) + # support .bgz with fread #5461 + file.copy(testDir("ch11b.dat.bgz"), f, overwrite=TRUE) + test(2229.55, fread(f, logical01=FALSE)[,1], data.table(V1 = 1:100)) } unlink(f) # not supporting multi file zips yet