Skip to content

Commit

Permalink
bug fixes, new issues with opusreader2
Browse files Browse the repository at this point in the history
  • Loading branch information
dylanbeaudette committed Feb 14, 2024
1 parent ddfa4b1 commit 46bab97
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 23 deletions.
21 changes: 16 additions & 5 deletions MIR/collect-wavenumber-metadata.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,35 @@ source('../code/snapshot-preparation/snapshot-functions.R')
base.path <- 'E:/temp/MIR_work/processed-collections'

## paths to full collection
# as of 2023-01-16 there are 1594 collections
# as of 2023-01-16 there are 1594 collections (including some non-public)
# as of 2024-02-14: 1387 public collections
f <- list.files(base.path, full.names = TRUE)

# 2023-01-07: 1594 collections
length(f)

## testing: ok
# z <- wavenumberMetadata(f[1])
# str(z)

## TODO: not robust to NULL list elements

## collection metadata by collection/sample/integer wn-sequence
# ~ 2 minutes
plan(multisession)

system.time(
z <- future_map(f, .progress = TRUE, .f = wavenumberMetadata)
z <- future_map(f, .progress = TRUE, .f = safely(wavenumberMetadata))
)

plan(sequential)

# test for error conditions
# each element of `z` is the list produced by the safely() wrapper above;
# extract its `error` component (NULL when the call succeeded)
e <- lapply(z, '[[', 'error')
# indices of collections for which wavenumberMetadata() signalled an error
which(!vapply(e, is.null, logical(1)))

f[1054]




# flatten
z <- do.call('rbind', z)
row.names(z) <- NULL
Expand Down Expand Up @@ -113,6 +121,9 @@ nrow(x <- z[z$wnID == 3, ])
knitr::kable(x[, 1:2], row.names = FALSE)
range(as.numeric(strsplit(x$wn[1], split = ',', fixed = TRUE)[[1]]))

## cleanup
rm(list = ls())
gc(reset = TRUE)



Expand Down
2 changes: 2 additions & 0 deletions MIR/main.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
##


# WD
setwd('MIR')

# build a set of RDS, one per spectra collection
# save to temporary folder for later use
Expand Down
22 changes: 15 additions & 7 deletions MIR/pre-process-collections.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,33 +17,41 @@ dir.create(proc.path, recursive = TRUE)


## paths to full collection
# as of 2023-01-16 there are 1594 collections
# as of 2023-01-16 there are 1594 collections (including some non-public)
# as of 2024-02-14: 1387 public collections
p <- list.dirs(file.path(base.path, 'MIR_Library'), recursive = TRUE, full.names = TRUE)

# remove the top-level directory
p <- p[-1]
# also remove the _INSTRUCTIONS dir
p <- p[-c(1:2)]



## pre-process OPUS files
# result is a set of RDS files, by spectra collection
# ~ 15 minutes
# ~ 13 minutes
# WD is the bottle-neck
# files cannot be open in OPUS software

# test working as expected: OK
# processOpusCollection(p[1], .output = proc.path)

# missing `ab` list element: 'e:/MIR/MIR_Library/C2019USNJ085/'

plan(multisession)

# writes RDS, no output here
# writes RDS, result is a list of collections with no usable data
system.time(
.trash <- future_map(p, .progress = TRUE, .f = processOpusCollection, .output = proc.path)
e <- future_map(p, .progress = TRUE, .f = processOpusCollection, .output = proc.path)
)

plan(sequential)

# done
# keep track of errors


## cleanup
rm(list = ls())
gc(reset = TRUE)



57 changes: 46 additions & 11 deletions code/snapshot-preparation/snapshot-functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,45 @@ processOpusCollection <- function(.collection, .output) {
# spectra ID
.sID <- gsub(pattern = '.0', replacement = '', x = basename(.files), fixed = TRUE)

## TODO: data_only = TRUE bug
## -> https://github.com/spectral-cockpit/opusreader2/issues/104

## TODO: warnings:
## -> In get_meta_utc_datetime(timestamp) : NAs introduced by coercion

# load all spectra objects in collection
# there may be cases with no usable data (why?)
# result is an empty list
x <- opusreader2::read_opus(.files, data_only = TRUE, parallel = FALSE, progress_bar = FALSE)
x <- opusreader2::read_opus(.files, data_only = FALSE, parallel = FALSE, progress_bar = FALSE)

## TODO: review with data_only = FALSE
## TODO: something wrong with C2019USNJ085/*

# find bad files / parse error (?)
idx <- which(sapply(x, length) < 1)
if(length(idx) > 0) {
# keep track / warn
.msg <- sprintf("unusable .0 file: %s [%s]", .sID[idx], .cID)
message(.msg)

# remove spectra + sample ID
x <- x[-idx]
.sID <- .sID[-idx]

}

# find spectra missing 'ab' element
idx <- which(sapply(x, function(i) {
is.null(i$ab)
}))

if(length(idx) > 0) {
# keep track / warn
.msg <- sprintf("unusable .0 file: %s [%s]", .sID[idx], .cID)
message(.msg)
.msg <- sprintf("missing `ab` file: %s [%s]", .sID[idx], .cID)
message(paste(.msg, collapse = '\n'))

# remove spectra + sample ID
x <- x[-idx]
.sID <- .sID[-idx]
}

# extract components from OPUS object
Expand All @@ -37,18 +61,29 @@ processOpusCollection <- function(.collection, .output) {

})

# keep track of collection ID
attr(.res, 'collection') <- .cID

# keep track of sample IDs in the spectra list
names(.res) <- .sID

# keep track of collection ID
attr(.res, 'collection') <- .cID

# save to RDS
.file <- sprintf('%s.rds', file.path(.output, .cID))
saveRDS(.res, file = .file)
# test for empty set
# all files in collection are invalid
# result is NULL
if(length(.res) < 1) {
.res <- NULL

# return the collection ID to the calling function
return(.cID)

# do not save RDS
} else {
# everything is fine
# save to RDS
.file <- sprintf('%s.rds', file.path(.output, .cID))
saveRDS(.res, file = .file)
}

## TODO: return error status
}


Expand Down

0 comments on commit 46bab97

Please sign in to comment.