From 2c8a6f4ae948a684785714397abdd2573889d0bf Mon Sep 17 00:00:00 2001 From: "alex.hill@gmail.com" Date: Mon, 16 Sep 2024 00:36:15 +0100 Subject: [PATCH 01/10] support date columns --- R/api.R | 21 +++++++++++++++++---- inst/schema/DataSeries.schema.json | 4 ++-- inst/schema/Variable.schema.json | 2 +- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/R/api.R b/R/api.R index 17549fa..9361fc7 100644 --- a/R/api.R +++ b/R/api.R @@ -72,7 +72,7 @@ target_get_dataset <- function(name, req) { variables <- list() for (col in cols) { lvls <- unique(dat[, col]) - if (length(lvls) < 12) { + if (length(lvls) < 25) { variables[[col]] <- list(name = jsonlite::unbox(col), levels = lvls) } } @@ -100,6 +100,7 @@ target_get_trace <- function(name, dataset <- read_dataset(req, name, scale) dat <- dataset$data xcol <- dataset$xcol + xtype <- dataset$xtype cols <- colnames(dat) if (!is.null(filter)) { filters <- strsplit(filter, "+", fixed = TRUE)[[1]] @@ -117,6 +118,7 @@ target_get_trace <- function(name, return(lapply(seq_along(groups), function(i) { model <- with_warnings(model_out(groups[[i]], xcol = xcol, + xtype = xtype, method = method, span = span, k = k)) @@ -129,6 +131,7 @@ target_get_trace <- function(name, logger::log_info("Returning single trace") model <- with_warnings(model_out(dat, xcol = xcol, + xtype = xtype, method = method, span = span, k = k)) @@ -156,11 +159,18 @@ read_dataset <- function(req, name, scale) { if (scale == "log2") { dat$value <- log2(dat$value) } + if (all(is.na(as.numeric(dat[, xcol])))) { + xtype <- "date" + dat[, xcol] <- as.Date(lubridate::parse_date_time(dat[, xcol], + c("dmy", "mdy", "ymd", "ydm"))) + } else { + xtype <- "number" + } xcol <- readLines(file.path(path, "xcol")) - list(data = dat, xcol = xcol) + list(data = dat, xcol = xcol, xtype = xtype) } -model_out <- function(dat, xcol, method = "auto", span = 0.75, k = 10) { +model_out <- function(dat, xcol, xtype, method = "auto", span = 0.75, k = 10) { n <- nrow(dat) if (n == 0) { return(list(x = list(), y = list())) @@ -170,13 +180,16 @@ model_out <- function(dat, xcol, method = "auto", span = 0.75, k = 10) { m <- mgcv::gam(eval(parse(text = fmla)), data = dat, method = "REML") } else { - fmla <- sprintf("value ~ %s", xcol) + fmla <- sprintf("value ~ as.numeric(%s)", xcol) m <- stats::loess(fmla, data = dat, span = span) } range <- range(dat[, xcol], na.rm = TRUE) xseq <- range[1]:range[2] xdf <- tibble::tibble(xcol = xseq) names(xdf) <- xcol + if (xtype == "date") { + xseq <- format(as.Date(xseq, origin = "1970-01-01")) + } list(x = xseq, y = stats::predict(m, xdf)) } diff --git a/inst/schema/DataSeries.schema.json b/inst/schema/DataSeries.schema.json index 15d17f9..bc642f4 100644 --- a/inst/schema/DataSeries.schema.json +++ b/inst/schema/DataSeries.schema.json @@ -13,7 +13,7 @@ "x": { "type": "array", "items": { - "type": "number" + "type": ["number", "string"] } }, "y": { @@ -32,7 +32,7 @@ "x": { "type": "array", "items": { - "type": "number" + "type": ["number", "string"] } }, "y": { diff --git a/inst/schema/Variable.schema.json b/inst/schema/Variable.schema.json index b963a79..2edc88f 100644 --- a/inst/schema/Variable.schema.json +++ b/inst/schema/Variable.schema.json @@ -8,7 +8,7 @@ "levels": { "type": "array", "items": { - "type": "string" + "type": ["string", "number", "null"] } } }, From bc3ce5d5a3312e44d40f1d6051ef38d0e8ae7882 Mon Sep 17 00:00:00 2001 From: "alex.hill@gmail.com" Date: Mon, 16 Sep 2024 00:44:36 +0100 Subject: [PATCH 02/10] read xcol in right order --- R/api.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/api.R b/R/api.R index 9361fc7..6ac1fc9 100644 --- a/R/api.R +++ b/R/api.R @@ -159,6 +159,7 @@ read_dataset <- function(req, name, scale) { if (scale == "log2") { dat$value <- log2(dat$value) } + xcol <- readLines(file.path(path, "xcol")) if (all(is.na(as.numeric(dat[, xcol])))) { xtype <- "date" dat[, xcol] <- as.Date(lubridate::parse_date_time(dat[, xcol], @@ -166,7 +167,6 @@ read_dataset <- function(req, name, scale) { } else { xtype <- "number" } - xcol <- readLines(file.path(path, "xcol")) list(data = dat, xcol = xcol, xtype = xtype) } From 4d43c59b04622bf668842b2265644f38ce2cc75c Mon Sep 17 00:00:00 2001 From: "alex.hill@gmail.com" Date: Mon, 16 Sep 2024 00:53:42 +0100 Subject: [PATCH 03/10] use default --- R/api.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/R/api.R b/R/api.R index 6ac1fc9..bd8d5ca 100644 --- a/R/api.R +++ b/R/api.R @@ -170,7 +170,11 @@ read_dataset <- function(req, name, scale) { list(data = dat, xcol = xcol, xtype = xtype) } -model_out <- function(dat, xcol, xtype, method = "auto", span = 0.75, k = 10) { +model_out <- function(dat, xcol, + xtype = "number", + method = "auto", + span = 0.75, + k = 10) { n <- nrow(dat) if (n == 0) { return(list(x = list(), y = list())) From ced7e8b4610e3488319ba9d180fd937adf4e0993 Mon Sep 17 00:00:00 2001 From: "alex.hill@gmail.com" Date: Mon, 16 Sep 2024 00:58:01 +0100 Subject: [PATCH 04/10] validate xcol --- R/api.R | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/R/api.R b/R/api.R index bd8d5ca..3fd51d7 100644 --- a/R/api.R +++ b/R/api.R @@ -160,11 +160,19 @@ read_dataset <- function(req, name, scale) { dat$value <- log2(dat$value) } xcol <- readLines(file.path(path, "xcol")) + logger::log_info("Parsing x column values") if (all(is.na(as.numeric(dat[, xcol])))) { xtype <- "date" dat[, xcol] <- as.Date(lubridate::parse_date_time(dat[, xcol], c("dmy", "mdy", "ymd", "ydm"))) + if (all(is.na(dat[, xcol]))) { + msg <- paste("Invalid x column values:", + "these should be numbers or dates in a standard format") + porcelain::porcelain_stop(msg) + } + logger::log_info("Detected date values in x column") } else { + logger::log_info("Deteced numeric values in x column") xtype <- "number" } list(data = dat, xcol = xcol, xtype = xtype) From 12254a6c5bc7a3c6b117443542417206e07d2bed Mon Sep 17 00:00:00 2001 From: "alex.hill@gmail.com" Date: Mon, 16 Sep 2024 10:17:32 +0100 Subject: [PATCH 05/10] add luibridate --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 544bd38..9aee150 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -16,6 +16,7 @@ Imports: fs, jsonlite, logger, + lubridate, mgcv, mime, plumber, From ae886fa39475c54c6aad7ffcfdb8032351c6c737 Mon Sep 17 00:00:00 2001 From: "alex.hill@gmail.com" Date: Mon, 16 Sep 2024 10:39:10 +0100 Subject: [PATCH 06/10] add dep to dockerfile --- docker/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index a622ec4..883dd43 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -17,6 +17,7 @@ RUN install_packages --repo=https://mrc-ide.r-universe.dev \ docopt \ fs \ logger \ + lubridate \ jsonlite \ jsonvalidate \ plumber \ From 2e0b656b6925d6767fee4db95ac0f7e16abee735 Mon Sep 17 00:00:00 2001 From: "alex.hill@gmail.com" Date: Mon, 16 Sep 2024 11:42:59 +0100 Subject: [PATCH 07/10] add tests --- R/api.R | 4 ++-- tests/testthat/test-model.R | 16 ++++++++++++++++ tests/testthat/test-read.R | 28 ++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/R/api.R b/R/api.R index 3fd51d7..9c386c6 100644 --- a/R/api.R +++ b/R/api.R @@ -161,11 +161,11 @@ read_dataset <- function(req, name, scale) { } xcol <- readLines(file.path(path, "xcol")) logger::log_info("Parsing x column values") - if (all(is.na(as.numeric(dat[, xcol])))) { + if (suppressWarnings(all(is.na(as.numeric(dat[, xcol]))))) { xtype <- "date" dat[, xcol] <- as.Date(lubridate::parse_date_time(dat[, xcol], c("dmy", "mdy", "ymd", "ydm"))) - if (all(is.na(dat[, xcol]))) { + if (suppressWarnings(all(is.na(dat[, xcol])))) { msg <- paste("Invalid x column values:", "these should be numbers or dates in a standard format") porcelain::porcelain_stop(msg) diff --git a/tests/testthat/test-model.R b/tests/testthat/test-model.R index 61ed0b6..7310684 100644 --- a/tests/testthat/test-model.R +++ b/tests/testthat/test-model.R @@ -66,3 +66,19 @@ test_that("model uses loess options", { expect_true(all(res$y == expected)) }) + +test_that("model can handle dates", { + dates <- sapply(1:50, function(x) as.Date(2 * x, origin = "2023-01-01")) + full_range <- sapply(2:100, function(x) as.Date(x, origin = "2023-01-01")) + dat <- data.frame(date = dates, value = rnorm(50)) + res <- model_out(dat, xcol = "date", xtype = "date") + + m <- stats::loess(value ~ as.numeric(date), data = dat) + xdf <- tibble::tibble(date = full_range) + expected <- stats::predict(m, xdf) + + expect_true(all(res$y == expected)) + expect_equal(res$x[1], "2023-01-03") + expect_equal(res$x[2], "2023-01-04") + expect_equal(res$x[99], "2023-04-11") +}) diff --git a/tests/testthat/test-read.R b/tests/testthat/test-read.R index c0b0d27..9623102 100644 --- a/tests/testthat/test-read.R +++ b/tests/testthat/test-read.R @@ -289,3 +289,31 @@ test_that("error running the model results in a 400", { expect_equal(body$errors[1, "detail"], "day has insufficient unique values to support 10 knots: reduce k.") }) + +test_that("can get dataset with dates", { + dates <- c("2023/15/01", "2023/16/01", "2023/17/01", "2023/18/01", "2023/20/01") + dat <- data.frame(biomarker = "ab", + value = 1:5, + day = dates) + router <- build_routes(cookie_key) + local_add_dataset(dat, name = "testdataset") + res <- router$call(make_req("GET", + "/dataset/testdataset/trace/ab/", + HTTP_COOKIE = cookie)) + expect_equal(res$status, 200) + body <- jsonlite::fromJSON(res$body) + data <- body$data + expect_equal(nrow(data), 1) + expect_equal(data$name, "all") + expect_equal(unlist(data$raw[1, "x"]), c("2023-01-15", "2023-01-16", "2023-01-17", "2023-01-18", "2023-01-20")) + expect_equal(unlist(data$raw[1, "y"]), 1:5) + + full_range_dates <- c("2023-01-15", "2023-01-16", "2023-01-17", "2023-01-18", "2023-01-19", "2023-01-20") + expect_equal(unlist(data$model[1, "x"]), full_range_dates) + parsed <- lubridate::parse_date_time(dates, c("dmy", "mdy", "ymd", "ydm")) + suppressWarnings({m <- stats::loess(value ~ as.numeric(day), data = data.frame(day = parsed, + value = 1:5))}) + parsed_full_range <- lubridate::parse_date_time(full_range_dates, c("dmy", "mdy", "ymd", "ydm")) + expected <- stats::predict(m, parsed_full_range) + expect_equal(unlist(data$model[1, "y"]), expected) +}) \ No newline at end of file From 92cb94adaad025dc4ed6220fc541264e9ec6b867 Mon Sep 17 00:00:00 2001 From: "alex.hill@gmail.com" Date: Mon, 16 Sep 2024 12:19:53 +0100 Subject: [PATCH 08/10] parse dates on first upload --- R/api.R | 31 ++++++++++++++++++------------- R/utils.R | 5 +++++ tests/testthat/helper.R | 1 + tests/testthat/test-read.R | 5 ++++- 4 files changed, 28 insertions(+), 14 deletions(-) diff --git a/R/api.R b/R/api.R index 9c386c6..446cfe6 100644 --- a/R/api.R +++ b/R/api.R @@ -46,10 +46,25 @@ target_post_dataset <- function(req, res) { return(bad_request_response(msg)) } + if (suppressWarnings(all(is.na(as.numeric(file_body[, xcol]))))) { + xtype <- "date" + file_body[, xcol] <- parse_date(file_body[, xcol]) + if (suppressWarnings(all(is.na(file_body[, xcol])))) { + msg <- paste("Invalid x column values:", + "these should be numbers or dates in a standard format") + return(bad_request_response(msg)) + } + logger::log_info("Detected date values in x column") + } else { + logger::log_info("Detected numeric values in x column") + xtype <- "number" + } + logger::log_info(paste("Saving dataset", filename, "to disk")) dir.create(path, recursive = TRUE) utils::write.csv(file_body, file.path(path, "data"), row.names = FALSE) write(xcol, file.path(path, "xcol")) + write(xtype, file.path(path, "xtype")) response_success(jsonlite::unbox(filename)) } @@ -160,20 +175,10 @@ read_dataset <- function(req, name, scale) { dat$value <- log2(dat$value) } xcol <- readLines(file.path(path, "xcol")) + xtype <- readLines(file.path(path, "xtype")) logger::log_info("Parsing x column values") - if (suppressWarnings(all(is.na(as.numeric(dat[, xcol]))))) { - xtype <- "date" - dat[, xcol] <- as.Date(lubridate::parse_date_time(dat[, xcol], - c("dmy", "mdy", "ymd", "ydm"))) - if (suppressWarnings(all(is.na(dat[, xcol])))) { - msg <- paste("Invalid x column values:", - "these should be numbers or dates in a standard format") - porcelain::porcelain_stop(msg) - } - logger::log_info("Detected date values in x column") - } else { - logger::log_info("Deteced numeric values in x column") - xtype <- "number" + if (xtype == "date") { + dat[, xcol] <- as.Date(dat[, xcol], origin = "1970-01-01") } list(data = dat, xcol = xcol, xtype = xtype) } diff --git a/R/utils.R b/R/utils.R index f0b6a3c..d09e7c9 100644 --- a/R/utils.R +++ b/R/utils.R @@ -22,3 +22,8 @@ validate_scale <- function(scale) { ) } } + +parse_date <- function(dat) { + as.Date(lubridate::parse_date_time(dat, + c("dmy", "mdy", "ymd", "ydm"))) +} diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index de16728..0ea6413 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -22,6 +22,7 @@ local_add_dataset <- function(dat, name, session = session_id, env = parent.fram dir.create(filepath, recursive = TRUE) write.csv(dat, file.path(filepath, "data"), row.names = FALSE) write("day", file.path(filepath, "xcol")) + write("number", file.path(filepath, "xtype")) withr::defer(fs::dir_delete(filepath), envir = env) name } diff --git a/tests/testthat/test-read.R b/tests/testthat/test-read.R index 9623102..6dc032d 100644 --- a/tests/testthat/test-read.R +++ b/tests/testthat/test-read.R @@ -296,7 +296,10 @@ test_that("can get dataset with dates", { value = 1:5, day = dates) router <- build_routes(cookie_key) - local_add_dataset(dat, name = "testdataset") + post_request <- local_POST_dataset_request(dat, + "testdataset", + cookie = cookie) + expect_equal(router$call(post_request)$status, 200) res <- router$call(make_req("GET", "/dataset/testdataset/trace/ab/", HTTP_COOKIE = cookie)) From 72492e510cdd5a6db4e9c684fb5885b3b9caf3e9 Mon Sep 17 00:00:00 2001 From: "alex.hill@gmail.com" Date: Mon, 16 Sep 2024 12:35:39 +0100 Subject: [PATCH 09/10] test different date formats --- R/api.R | 7 +-- tests/testthat/test-dates.R | 83 ++++++++++++++++++++++++++++++++++++ tests/testthat/test-upload.R | 17 ++++++++ 3 files changed, 104 insertions(+), 3 deletions(-) create mode 100644 tests/testthat/test-dates.R diff --git a/R/api.R b/R/api.R index 446cfe6..7db4591 100644 --- a/R/api.R +++ b/R/api.R @@ -48,10 +48,11 @@ target_post_dataset <- function(req, res) { if (suppressWarnings(all(is.na(as.numeric(file_body[, xcol]))))) { xtype <- "date" - file_body[, xcol] <- parse_date(file_body[, xcol]) - if (suppressWarnings(all(is.na(file_body[, xcol])))) { + suppressWarnings({file_body[, xcol] <- parse_date(file_body[, xcol])}) + if (all(is.na(file_body[, xcol]))) { + res$status <- 400L msg <- paste("Invalid x column values:", - "these should be numbers or dates in a standard format") + "these should be numbers or dates in a standard format.") return(bad_request_response(msg)) } logger::log_info("Detected date values in x column") diff --git a/tests/testthat/test-dates.R b/tests/testthat/test-dates.R new file mode 100644 index 0000000..11f7d05 --- /dev/null +++ b/tests/testthat/test-dates.R @@ -0,0 +1,83 @@ +test_that("d/m/y", { + router <- build_routes(cookie_key) + request <- local_POST_dataset_request(data.frame(biomarker = "ab", + day = c("01/01/2024", + "02/01/2024"), + value = 1:10), + "testdataset", + cookie = cookie) + upload_res <- router$call(request) + expect_equal(upload_res$status, 200) + + res <- router$call(make_req("GET", + "/dataset/testdataset/trace/ab/", + HTTP_COOKIE = cookie)) + + expect_equal(res$status, 200) + body <- jsonlite::fromJSON(res$body) + data <- body$data + expect_equal(unlist(data$raw[1, "x"]), rep(c("2024-01-01", "2024-01-02"), 5)) +}) + +test_that("d-m-y", { + router <- build_routes(cookie_key) + request <- local_POST_dataset_request(data.frame(biomarker = "ab", + day = c("01-01-2024", + "02-01-2024"), + value = 1:10), + "testdataset", + cookie = cookie) + upload_res <- router$call(request) + expect_equal(upload_res$status, 200) + + res <- router$call(make_req("GET", + "/dataset/testdataset/trace/ab/", + HTTP_COOKIE = cookie)) + + expect_equal(res$status, 200) + body <- jsonlite::fromJSON(res$body) + data <- body$data + expect_equal(unlist(data$raw[1, "x"]), rep(c("2024-01-01", "2024-01-02"), 5)) +}) + +test_that("y-m-d", { + router <- build_routes(cookie_key) + request <- local_POST_dataset_request(data.frame(biomarker = "ab", + day = c("2024-01-13", + "2024-01-14"), + value = 1:10), + "testdataset", + cookie = cookie) + upload_res <- router$call(request) + expect_equal(upload_res$status, 200) + + res <- router$call(make_req("GET", + "/dataset/testdataset/trace/ab/", + HTTP_COOKIE = cookie)) + + expect_equal(res$status, 200) + body <- jsonlite::fromJSON(res$body) + data <- body$data + expect_equal(unlist(data$raw[1, "x"]), rep(c("2024-01-13", "2024-01-14"), 5)) +}) + +test_that("y/d/m", { + router <- build_routes(cookie_key) + request <- local_POST_dataset_request(data.frame(biomarker = "ab", + day = c("2024/14/01", + "2024/15/01"), + value = 1:10), + "testdataset", + cookie = cookie) + upload_res <- router$call(request) + expect_equal(upload_res$status, 200) + + res <- router$call(make_req("GET", + "/dataset/testdataset/trace/ab/", + HTTP_COOKIE = cookie)) + + expect_equal(res$status, 200) + body <- jsonlite::fromJSON(res$body) + data <- body$data + expect_equal(unlist(data$raw[1, "x"]), rep(c("2024-01-14", "2024-01-15"), 5)) +}) diff --git a/tests/testthat/test-upload.R b/tests/testthat/test-upload.R index a338036..d8ec997 100644 --- a/tests/testthat/test-upload.R +++ b/tests/testthat/test-upload.R @@ -46,6 +46,23 @@ test_that("uploading dataset with duplicate name returns 400", { "testdataset already exists. Please choose a unique name for this dataset.") }) +test_that("uploading dataset with invalid xcol values returns 400", { + router <- build_routes(cookie_key) + request <- local_POST_dataset_request(data.frame(biomarker = "ab", + day = c("a", "b"), + value = 1), + "testdataset", + cookie = cookie) + res <- router$call(request) + expect_equal(res$status, 400) + + res <- router$call(request) + body <- jsonlite::fromJSON(res$body) + validate_failure_schema(res$body) + expect_equal(body$errors[1, "detail"], + "Invalid x column values: these should be numbers or dates in a standard format.") +}) + test_that("can upload dataset with different xcol", { router <- build_routes(cookie_key) request <- local_POST_dataset_request(data.frame(biomarker = "ab", From a6096dde0615ab881f9f6951825cc1dfd48c5acb Mon Sep 17 00:00:00 2001 From: "alex.hill@gmail.com" Date: Mon, 16 Sep 2024 12:38:32 +0100 Subject: [PATCH 10/10] lint --- R/api.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/api.R b/R/api.R index 7db4591..c18485c 100644 --- a/R/api.R +++ b/R/api.R @@ -48,7 +48,9 @@ target_post_dataset <- function(req, res) { if (suppressWarnings(all(is.na(as.numeric(file_body[, xcol]))))) { xtype <- "date" - suppressWarnings({file_body[, xcol] <- parse_date(file_body[, xcol])}) + suppressWarnings({ + file_body[, xcol] <- parse_date(file_body[, xcol]) + }) if (all(is.na(file_body[, xcol]))) { res$status <- 400L msg <- paste("Invalid x column values:",