Skip to content

Commit

Permalink
Merge pull request #3 from geysertimes/data-load
Browse files Browse the repository at this point in the history
Initial package for getting and loading the data.
  • Loading branch information
TR4Android authored May 12, 2019
2 parents 9284fbe + 5627acd commit c24ac9f
Show file tree
Hide file tree
Showing 11 changed files with 375 additions and 0 deletions.
16 changes: 16 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Package: geysertimes
Title: Geyser Data from GeyserTimes.org
Version: 0.0.0.9000
Imports: lubridate, rappdirs, readr
Authors@R:
person(given = "Stephen",
family = "Kaluzny",
role = c("aut", "cre"),
email = "spkaluzny@gmail.com")
Description: Gets geyser eruption and observation data from the GeyserTimes
database and optionally stores it locally.
License: MIT + file LICENSE
VignetteBuilder: knitr
Encoding: UTF-8
LazyData: true
NeedsCompilation: no
6 changes: 6 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
export(
gt_get_data,
gt_load_data,
gt_path,
gt_version
)
33 changes: 33 additions & 0 deletions R/gt_get_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Download one dated GeyserTimes eruptions archive and cache it as an .rds file.
#
# Arguments:
#   dest_folder  directory under which "<version>/eruptions_data.rds" is written.
#   overwrite    if FALSE and that file already exists, a warning is issued and
#                the existing path is returned without downloading again.
#   quiet        if TRUE, the hint about using a persistent dest_folder is
#                not shown.
#   version      archive date (a Date or a "yyyy-mm-dd" string); it is used in
#                both the download URL and the output directory name.
#
# Returns, invisibly, the full path of the saved .rds file.
# Stops with an error if the archive cannot be downloaded.
gt_get_data <- function(dest_folder = file.path(tempdir(), "GeyserTimes"),
  overwrite=FALSE, quiet=FALSE, version=lubridate::today()) {
  # Use a single character form of the version for every path and URL below.
  version <- as.character(version)
  if(!quiet && dest_folder != gt_path()) {
    message("Set dest_folder to geysertimes::gt_path() so that data persists between R sessions.\n")
  }
  outpath <- file.path(dest_folder, version, "eruptions_data.rds")
  if(file.exists(outpath) && !overwrite) {
    warning("GeyserTimes data for this version already exists on the local machine. Use the 'overwrite' argument to re-download if necessary.")
    return(invisible(outpath))
  }
  base_url <- "https://geysertimes.org/archive/complete/"
  raw_data_file <- paste0("geysertimes_eruptions_complete_", version, ".tsv.gz")
  download_data_file_path <- file.path(tempdir(), raw_data_file)
  data_url <- paste0(base_url, raw_data_file)
  # Warnings are silenced for the rest of the call (as before); the previous
  # setting is restored on exit without clobbering other exit handlers.
  oldOpt <- options(warn=-1)
  on.exit(options(oldOpt), add=TRUE)
  # download.file() either signals an error or returns a non-zero status on
  # failure; both cases must stop here rather than fall through to read_tsv().
  download_status <- tryCatch(
    download.file(data_url, destfile=download_data_file_path, quiet=TRUE),
    error=function(e) e)
  download_failed <- inherits(download_status, "error") ||
    !identical(as.integer(download_status), 0L)
  if(download_failed) {
    # Do not leave a partial download behind.
    unlink(download_data_file_path)
    reason <- if(inherits(download_status, "error")) {
      conditionMessage(download_status)
    } else {
      paste("status code", download_status)
    }
    stop("Could not download GeyserTimes data for version '", version,
      "' from ", data_url, " (", reason, ").", call.=FALSE)
  }
  gt_tib <- readr::read_tsv(gzfile(download_data_file_path),
    col_types=c("dcddddddddddddccccdddc"), quote="", progress=FALSE)
  # These three columns are converted with lubridate::as_datetime().
  for(time_col in c("eruption_time_epoch", "time_updated", "time_entered")) {
    gt_tib[[time_col]] <- lubridate::as_datetime(gt_tib[[time_col]])
  }
  # Create the version directory only once there is data to put in it.
  outdir <- dirname(outpath)
  if(!dir.exists(outdir)) {
    dir.create(outdir, recursive=TRUE)
  }
  saveRDS(gt_tib, file=outpath)
  invisible(outpath)
}
22 changes: 22 additions & 0 deletions R/gt_load_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Load a previously saved GeyserTimes data object.
#
# Arguments:
#   path     directory holding "<version>/eruptions_data.rds".
#   quiet    if TRUE, no messages are shown.
#   version  version to load; when NULL it is looked up with gt_version(),
#            first under `path` and then under tempdir()/GeyserTimes.
#
# Returns the object read from the .rds file, or NULL when no version was
# requested and none could be found in either location.
gt_load_data <- function(path=gt_path(), quiet=FALSE, version=NULL) {
  if(is.null(version)) {
    version <- gt_version(path, quiet=TRUE)
    if(is.null(version)) {
      if(!quiet) {
        message("Cannot find any GeyserTimes data under ", path)
      }
      # Nothing under `path`: fall back to the session's temporary location.
      path <- file.path(tempdir(), "GeyserTimes")
      version <- gt_version(path, quiet=TRUE)
      if(is.null(version)) {
        return(NULL)
      }
      if(!quiet) {
        message("Loading data from ", path)
      }
    }
  }
  readRDS(file.path(path, version, "eruptions_data.rds"))
}
7 changes: 7 additions & 0 deletions R/gt_path.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Return the directory used for local GeyserTimes data.
#
# Arguments:
#   temp  if TRUE, return the "GeyserTimes" directory under tempdir();
#         otherwise return the per-user data directory from rappdirs.
gt_path <- function(temp=FALSE) {
  if(temp) {
    return(file.path(tempdir(), "GeyserTimes"))
  }
  rappdirs::user_data_dir(appname = "GeyserTimes", appauthor = "GeyserTimes")
}
19 changes: 19 additions & 0 deletions R/gt_version.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Report which dated versions of the GeyserTimes data are stored under `path`.
#
# A version is a directory directly under `path` whose name is exactly a
# yyyy-mm-dd date and which contains a file named "eruptions_data.rds".
#
# Arguments:
#   path   directory to search.
#   quiet  if TRUE, no message is shown when nothing is found.
#   all    if TRUE, return every version found (newest first);
#          otherwise return only the newest.
#
# Returns a Date vector, or NULL when no version is found.
"gt_version" <- function(path=gt_path(), quiet=FALSE, all=FALSE) {
  # Anchor the pattern so that only the file gt_load_data() reads is matched
  # (not, e.g., "old_eruptions_data.rds").
  gt_files <- list.files(path, pattern="^eruptions_data\\.rds$", recursive=TRUE)
  version_dirs <- dirname(gt_files)
  versions <- as.Date(version_dirs, format="%Y-%m-%d")
  # only directories of form yyyy-mm-dd are allowed:
  # the round trip through as.character() rejects names such as "2019-5-1"
  # or nested paths that merely start with a date.
  is_version_dir <- !is.na(versions) & as.character(versions) == version_dirs
  versions <- sort(versions[is_version_dir], decreasing=TRUE)
  if(length(versions) < 1) {
    if(!quiet) {
      message("Cannot find any GeyserTimes data under ", path)
    }
    return(NULL)
  }
  if(all) {
    versions
  } else {
    versions[1]
  }
}
59 changes: 59 additions & 0 deletions man/gt_get_data.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
\name{gt_get_data}
\alias{gt_get_data}
\title{
Download Geyser Times Data
}
\description{
Downloads the data from geysertimes.org.
Reads the data and creates a tibble object in \code{dest_folder}.
}
\usage{
gt_get_data(dest_folder = file.path(tempdir(), "GeyserTimes"),
overwrite = FALSE, quiet = FALSE, version = lubridate::today())
}
\arguments{
\item{dest_folder}{
the location where the binary tibble object should be written.
The default is under the current R session's temp directory
which will disappear when the session ends.
}
\item{overwrite}{
a logical value,
if \code{FALSE}, the data will not be downloaded again if a copy of the
data, with \code{version}, already exists in \code{dest_folder}.
}
\item{quiet}{
a logical value, if \code{TRUE}, no messages are displayed.
}
\item{version}{
a character string giving the version of the data to download.
This should be a date in the form \code{yyyy-mm-dd}.
Typically, only the version with today's date is available.
}
}
\details{
The data is downloaded from the GeyserTimes archive web site
\url{https://geysertimes.org/archive/} to the \code{tempdir()} directory.
The data is then read with \code{readr::read_tsv} with appropriate
column types.
The resulting \code{tibble} object is then saved as a binary (\code{.rds})
file under \code{dest_folder}.
}
\value{
a character string giving the full path to GeyserTimes data object.
}
\author{
Stephen Kaluzny <spkaluzny@gmail.com>.
}
\note{
Users are encouraged to set \code{dest_folder} to \code{gt_path()} to save
a persistent copy of the data.
}
\seealso{
gt_load_data.
}
\examples{
# Both calls download the current archive; the argument naming the
# destination is `dest_folder`.
dpath0 <- gt_get_data() # data saved under tempdir()
dpath1 <- gt_get_data(dest_folder=gt_path()) # data saved under gt_path()
}
\keyword{geysertimes}
49 changes: 49 additions & 0 deletions man/gt_load_data.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
\name{gt_load_data}
\alias{gt_load_data}
\title{
Load the Geyser Times Data
}
\description{
Loads the Geyser Times data that was previously downloaded by a call
to \code{gt_get_data}.
}
\usage{
gt_load_data(path = gt_path(), quiet = FALSE, version = NULL)
}
\arguments{
\item{path}{
a character string, the local location where the Geyser Times data
has been written.
The default is the local permanent location given by \code{gt_path()}.
If no appropriate data is found at that location,
the function will look in the temporary location given by
\code{gt_path(temp=TRUE)}.
}
\item{quiet}{
a logical value, if \code{TRUE}, no messages are displayed.
}
\item{version}{
a character string giving the version of the Geyser Times data to load.
Calling \code{gt_version(path, all=TRUE)} will list all versions
available under \code{path}.
}
}
\details{
Typically, a user would download the data once, with a call to
\code{gt_get_data(dest_folder=gt_path())}.
Subsequent R sessions can then load this downloaded data with a
call to \code{gt_load_data()}.
}
\value{
a tibble containing the Geyser Times data with names:
(need to decide on appropriate names)
}
\author{
Stephen Kaluzny <spkaluzny@gmail.com>
}
\seealso{
\code{gt_get_data}.
}
\examples{
}
\keyword{geysertimes}
37 changes: 37 additions & 0 deletions man/gt_path.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
\name{gt_path}
\alias{gt_path}
\title{
Path to GeyserTimes Local Data
}
\description{
Returns the path where local GeyserTimes data is stored.
}
\usage{
gt_path(temp = FALSE)
}
\arguments{
\item{temp}{
a logical value, if \code{TRUE}, the temporary path is returned.
This location will disappear when the R session ends.
}
}
\details{
%% ~~ If necessary, more details than the description above ~~
}
\value{
a character string giving the full path where local GeyserTimes data is stored.
}
\author{
Stephen Kaluzny <spkaluzny@gmail.com>
}
\note{
%% ~~further notes~~
}
\seealso{
%% ~~objects to See Also as \code{\link{help}}, ~~~
}
\examples{
# The default location for the GeyserTimes data:
gt_path()
}
\keyword{geysertimes}
42 changes: 42 additions & 0 deletions man/gt_version.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
\name{gt_version}
\alias{gt_version}
\title{Version of GeyserTimes Data}
\description{
Returns the version of the current GeyserTimes data.
This is a \code{Date} object, printed in yyyy-mm-dd format.
}
\usage{
gt_version(path = gt_path(), quiet = FALSE, all = FALSE)
}
\arguments{
\item{path}{
the path to the GeyserTimes data.
The default is the location given by \code{gt_path()}.
}
\item{quiet}{
a logical value, if \code{TRUE}, no messages are printed.
}
\item{all}{
list all versions of the GeyserTimes data found,
not just the newest.
}
}
\details{
%% ~~ If necessary, more details than the description above ~~
}
\value{
a \code{Date} vector giving the version(s) of GeyserTimes data
stored under \code{path}, or \code{NULL} if none is found.
}
\author{
Stephen Kaluzny <spkaluzny@gmail.com>
}
\note{
}
\seealso{
\code{gt_path}
}
\examples{
gt_version()
}
\keyword{geysertimes}
85 changes: 85 additions & 0 deletions vignettes/geysertimes.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
---
title: "Introduction to geysertimes"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Introduction to geysertimes}
%\VignetteEngine{knitr::rmarkdown}
---

```{r, echo = FALSE, message = FALSE}
# Chunk defaults for the vignette: collapse source and output together and
# prefix output lines with "#>". TRUE is spelled out because T can be reassigned.
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
# Keep printed tibbles short in the rendered vignette.
options(tibble.print_min = 4L, tibble.print_max = 4L)
```

# Basic Use

Load the package
```{r library}
library("geysertimes")
```

## Get the Data

The `gt_get_data` function downloads the compressed eruptions data
from `https://geysertimes.org/archive/`,
reads the compressed data into R
and saves a version of the R object
in the location specified
in the `dest_folder` argument to the function.
The default location for `dest_folder` is
`file.path(tempdir(), "GeyserTimes")`.
This default location is used to meet the CRAN requirement of
not writing files by default to any location other than under `tempdir()`.

```{r default_get}
defpath <- gt_get_data()
defpath
```

Users are encouraged to set `dest_folder` to the value given by
`gt_path()` which is a permanent location appropriate for the
user on the particular platform.

```{r gt_path}
gt_path()
```

If a permanent location is used, the user only needs to get the
data once.
Using the suggested value for `dest_folder`:
```{r recommend_path}
recpath <- gt_get_data(dest_folder=gt_path())
recpath
```

## Load the Data

The `gt_load_data` function is used to load the saved R object.

```{r load01}
gtdata <- gt_load_data()
```

A quick look at the data:
```{r look}
dim(gtdata)
names(gtdata)
```

### Data Version
The data that is downloaded is versioned.
The version id is the date when the data was downloaded.

The `gt_version()` function lists the latest version of the data that
has been downloaded.
Setting `all=TRUE` will list all versions of the data that have been
downloaded.

```{r version}
gt_version()
```

```{r version_all}
gt_version(all=TRUE)
```

0 comments on commit c24ac9f

Please sign in to comment.