diff --git a/MIR/collect-wavenumber-metadata.R b/MIR/collect-wavenumber-metadata.R index c2b7ebd..9a5674f 100644 --- a/MIR/collect-wavenumber-metadata.R +++ b/MIR/collect-wavenumber-metadata.R @@ -8,27 +8,35 @@ source('../code/snapshot-preparation/snapshot-functions.R') base.path <- 'E:/temp/MIR_work/processed-collections' ## paths to full collection -# as of 2023-01-16 there are 1594 collections +# as of 2023-01-16 there are 1594 collections (including some non-public) +# as of 2024-02-14: 1387 public collections f <- list.files(base.path, full.names = TRUE) -# 2023-01-07: 1594 collections -length(f) - ## testing: ok # z <- wavenumberMetadata(f[1]) # str(z) +## TODO: not robust to NULL list elements ## collection metadata by collection/sample/integer wn-sequence # ~ 2 minutes plan(multisession) system.time( - z <- future_map(f, .progress = TRUE, .f = wavenumberMetadata) + z <- future_map(f, .progress = TRUE, .f = safely(wavenumberMetadata)) ) plan(sequential) +# test for error conditions +e <- sapply(z, '[', 'error') +which(!sapply(e, is.null)) + +f[1054] + + + + # flatten z <- do.call('rbind', z) row.names(z) <- NULL @@ -113,6 +121,9 @@ nrow(x <- z[z$wnID == 3, ]) knitr::kable(x[, 1:2], row.names = FALSE) range(as.numeric(strsplit(x$wn[1], split = ',', fixed = TRUE)[[1]])) +## cleanup +rm(list = ls()) +gc(reset = TRUE) diff --git a/MIR/main.R b/MIR/main.R index 875f0dc..13f84cd 100644 --- a/MIR/main.R +++ b/MIR/main.R @@ -3,6 +3,8 @@ ## +# WD +setwd('MIR') # build a set of RDS, one per spectra collection # save to temporary folder for later use diff --git a/MIR/pre-process-collections.R b/MIR/pre-process-collections.R index 19c1982..0df21cd 100644 --- a/MIR/pre-process-collections.R +++ b/MIR/pre-process-collections.R @@ -17,33 +17,41 @@ dir.create(proc.path, recursive = TRUE) ## paths to full collection -# as of 2023-01-16 there are 1594 collections +# as of 2023-01-16 there are 1594 collections (including some non-public) +# as of 2024-02-14: 1387 public 
collections p <- list.dirs(file.path(base.path, 'MIR_Library'), recursive = TRUE, full.names = TRUE) # remove the top-level directory -p <- p[-1] +# also remove the _INSTRUCTIONS dir +p <- p[-c(1:2)] + ## pre-process OPUS files # result is a set of RDS files, by spectra collection -# ~ 15 minutes +# ~ 13 minutes # WD is the bottle-neck -# files cannot be open in OPUS software # test working as expected: OK # processOpusCollection(p[1], .output = proc.path) +# missing `ab` list element: 'e:/MIR/MIR_Library/C2019USNJ085/' plan(multisession) -# writes RDS, no output here +# writes RDS, result is a list of collections with no usable data system.time( - .trash <- future_map(p, .progress = TRUE, .f = processOpusCollection, .output = proc.path) + e <- future_map(p, .progress = TRUE, .f = processOpusCollection, .output = proc.path) ) plan(sequential) -# done +# keep track of errors + + +## cleanup +rm(list = ls()) +gc(reset = TRUE) diff --git a/code/snapshot-preparation/snapshot-functions.R b/code/snapshot-preparation/snapshot-functions.R index df85b49..ed2de8b 100644 --- a/code/snapshot-preparation/snapshot-functions.R +++ b/code/snapshot-preparation/snapshot-functions.R @@ -10,21 +10,45 @@ processOpusCollection <- function(.collection, .output) { # spectra ID .sID <- gsub(pattern = '.0', replacement = '', x = basename(.files), fixed = TRUE) + ## TODO: data_only = TRUE bug + ## -> https://github.com/spectral-cockpit/opusreader2/issues/104 + + ## TODO: warnings: + ## -> In get_meta_utc_datetime(timestamp) : NAs introduced by coercion + # load all spectra objects in collection # there may be cases with no usable data (why?) # result is an empty list - x <- opusreader2::read_opus(.files, data_only = TRUE, parallel = FALSE, progress_bar = FALSE) + x <- opusreader2::read_opus(.files, data_only = FALSE, parallel = FALSE, progress_bar = FALSE) + + ## TODO: review with data_only = FALSE + ## TODO: something wrong with C2019USNJ085/* # find bad files / parse error (?) 
idx <- which(sapply(x, length) < 1) if(length(idx) > 0) { + # keep track / warn + .msg <- sprintf("unusable .0 file: %s [%s]", .sID[idx], .cID) + message(paste(.msg, collapse = '\n')) + # remove spectra + sample ID x <- x[-idx] .sID <- .sID[-idx] - + } + + # find spectra missing 'ab' element + idx <- which(vapply(x, function(i) { + is.null(i$ab) + }, logical(1))) + + if(length(idx) > 0) { # keep track / warn - .msg <- sprintf("unusable .0 file: %s [%s]", .sID[idx], .cID) - message(.msg) + .msg <- sprintf("missing `ab` file: %s [%s]", .sID[idx], .cID) + message(paste(.msg, collapse = '\n')) + + # remove spectra + sample ID + x <- x[-idx] + .sID <- .sID[-idx] } # extract components from OPUS object @@ -37,18 +61,29 @@ processOpusCollection <- function(.collection, .output) { }) + # keep track of collection ID + attr(.res, 'collection') <- .cID # keep track of sample IDs in the spectra list names(.res) <- .sID - # keep track of collection ID - attr(.res, 'collection') <- .cID - - # save to RDS - .file <- sprintf('%s.rds', file.path(.output, .cID)) - saveRDS(.res, file = .file) + # test for empty set + # all files in collection are invalid + # result is NULL + if(length(.res) < 1) { + .res <- NULL + + # return collection ID to calling function + return(.cID) + + # do not save RDS + } else { + # everything is fine + # save to RDS + .file <- sprintf('%s.rds', file.path(.output, .cID)) + saveRDS(.res, file = .file) + } - ## TODO: return error status }