Merge pull request #95 from umccr/hrd_sig_report
umccrise: support S3 outputs
pdiakumis authored Sep 13, 2023
2 parents 4436c11 + 045c9cb commit b90cd73
Showing 7 changed files with 434 additions and 61 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -3,6 +3,7 @@
__pycache__/
*.py[cod]
.Rproj.user
.Rhistory

/nogit
/docs
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -62,6 +62,8 @@ export(multiqc_tidy_json)
export(portal_meta_read)
export(rdf2tab)
export(read)
export(s3_files_list_filter_relevant)
export(s3_search)
export(session_info_kable)
export(time_metrics_process)
export(tso_rmd)
101 changes: 101 additions & 0 deletions R/s3.R
@@ -0,0 +1,101 @@
#' List Relevant Files In AWS S3 Directory
#'
#' Lists relevant files in an AWS S3 directory.
#'
#' @param s3dir S3 directory.
#' @param pattern Pattern to further filter the returned file type tibble.
#' @param page_size The size of each page to get in the AWS service call (def: 1000).
#' @param max_items The total number of items to return in the command’s output (def: 1000).
#' @param presign Include presigned URLs (def: FALSE).
#' @param expiry_sec Number of seconds the presigned URL remains valid, if generated (def: 43200, i.e. 12hrs).
#'
#' @return A tibble with path, date, file size, file type, and presigned URL if requested.
#' @examples
#' \dontrun{
#' s3dir <- "s3://umccr-primary-data-prod/Accreditation/ALLOCATE-134131/WGS/2021-07-26/umccrised/ALLOCATE-134131__ALLOCATE-134131_MDx150892_Missing/cancer_report_tables"
#' s3_files_list_filter_relevant(s3dir = s3dir, presign = TRUE)
#' }
#' @export
s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 1000, max_items = 1000, presign = FALSE, expiry_sec = 43200) {
assertthat::assert_that(grepl("^s3://", s3dir), rlang::is_logical(presign))
pattern <- pattern %||% ".*" # keep all recognisable files by default
# split s3dir into its bucket (b) and key prefix (p) components
b <- sub("s3://(.*?)/.*", "\\1", s3dir)
p <- sub("s3://(.*?)/(.*)", "\\2", s3dir)
cmd <- glue(
"aws --output json s3api list-objects-v2 --bucket {b} --prefix {p} ",
"--max-items {max_items} --page-size {page_size}"
)
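# For illustration only: with a hypothetical s3dir of "s3://bkt/pre/fix" and
# the default paging arguments, the command constructed above would be:
#   aws --output json s3api list-objects-v2 --bucket bkt --prefix pre/fix --max-items 1000 --page-size 1000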
l <- system(cmd, intern = TRUE)
j <- jsonlite::fromJSON(l)
assertthat::assert_that("Contents" %in% names(j))
d <- j[["Contents"]] |>
tibble::as_tibble() |>
dplyr::mutate(
path = glue("s3://{b}/{.data$Key}"),
date_utc = .data$LastModified,
size = fs::as_fs_bytes(.data$Size),
bname = basename(.data$path),
# match_regex maps a file's basename to a known file type (NA if unrecognised)
type = purrr::map_chr(.data$bname, match_regex)
) |>
dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |>
dplyr::select("path", "date_utc", "size", "type")

if (presign) {
d <- d |>
dplyr::rowwise() |>
dplyr::mutate(presigned_url = s3_file_presignedurl(.data$path, expiry_seconds = expiry_sec)) |>
dplyr::ungroup()
}
d
}
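
# A minimal usage sketch (hypothetical path; assumes AWS CLI credentials with
# read access to the bucket): keep only file types matching "tsv" and attach
# presigned URLs.
#   s3dir <- "s3://umccr-primary-data-prod/some/prefix"
#   s3_files_list_filter_relevant(s3dir, pattern = "tsv", presign = TRUE)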

# Presign a single S3 path via the AWS CLI, returning the URL as a string.
s3_file_presignedurl <- function(s3path, expiry_seconds = 3600) {
system(glue("aws s3 presign {s3path} --expires-in {expiry_seconds}"), intern = TRUE)
}

#' Search AWS S3 Objects
#'
#' Searches for the given pattern in the UMCCR `umccr-primary-data-prod` AWS S3
#' bucket.
#'
#' @param pat Pattern to search for (e.g. 'multiqc_data.json').
#' @param rows Max number of rows to return.
#'
#' @return Tibble with S3 path, object size, date modified, id, unique hash.
#'
#' @examples
#' \dontrun{
#' pat <- "qc_summary.tsv.gz"
#' s3_search(pat, 10)
#' }
#' @export
s3_search <- function(pat, rows) {
au_tz <- "Australia/Melbourne"
utc_tz <- "UTC"
base_url <- "https://api.portal.prod.umccr.org/iam/s3"
url1 <- utils::URLencode(glue("{base_url}?rowsPerPage={rows}&search={pat}"))
awscurl_cmd <- glue(
"awscurl '{url1}' ",
"--header 'Accept: application/json'"
)
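# For illustration only: with pat = "qc_summary.tsv.gz" and rows = 10, the
# command constructed above would be:
#   awscurl 'https://api.portal.prod.umccr.org/iam/s3?rowsPerPage=10&search=qc_summary.tsv.gz' --header 'Accept: application/json'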
message(glue("Running {awscurl_cmd}"))
j <- system(awscurl_cmd, intern = TRUE)
date_fmt <- "%Y-%m-%dT%H:%M:%S"
# bucket, key and last_modified_date are columns in the portal API results
j |>
jsonlite::fromJSON() |>
purrr::pluck("results") |>
tibble::as_tibble() |>
dplyr::mutate(
date1 = as.POSIXct(.data$last_modified_date, tz = utc_tz, format = date_fmt),
date_aest = lubridate::with_tz(.data$date1, tz = au_tz),
path = glue("s3://{bucket}/{key}"),
size = fs::as_fs_bytes(.data$size)
) |>
dplyr::select("path", "size", "date_aest", "id", "unique_hash")
}
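
A possible workflow combining the two new exports (a sketch only, assuming awscurl and AWS CLI credentials are configured; the search pattern is taken from the roxygen example above):

res <- s3_search(pat = "qc_summary.tsv.gz", rows = 10)
# presign everything in the directory containing the first hit
s3dir <- dirname(res$path[1])
s3_files_list_filter_relevant(s3dir = s3dir, presign = TRUE)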