diff --git a/DESCRIPTION b/DESCRIPTION index e4842f3..12d5410 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,18 +1,18 @@ Package: cleanEHR Type: Package Title: The Critical Care Clinical Data Processing Tools -Version: 0.1 -Date: 2017-01-30 -Author: Sinan Shi, David Pérez-Suárez, Steve Harris, Niall MacCallum, David +Version: 0.2 +Author: Sinan Shi, David Perez-Suarez, Steve Harris, Niall MacCallum, David Brealey, Mervyn Singer, James Hetherington Maintainer: Sinan Shi -Description: A toolset to deal with the Critical Care Health Informatics - Collaborative dataset. It is created to address various data reliability and - accessibility problems of electronic healthcare records (EHR). It provides a - unique platform which enables data manipulation, transformation, reduction, - anonymisation, cleaning and validation. +Description: An electronic health care record (EHR) data cleaning and processing + platform. It focus on heterogeneous high resolution longitudinal data. It works with + Critical Care Health Informatics Collaborative (CCHIC) dataset. It is + created to address various data reliability and accessibility problems of + EHRs as such. Depends: - R (>= 3.1.0) + R (>= 3.1.0) +BugReports: https://github.com/CC-HIC/cleanEHR/issues License: GPL-3 LinkingTo: Rcpp Suggests: @@ -22,12 +22,10 @@ Imports: XML, yaml, Rcpp, - methods, - knitr, ggplot2, + methods, pander, - stats, - utils + knitr VignetteBuilder: knitr URL: https://github.com/CC-HIC/cleanEHR, http://www.hic.nihr.ac.uk RoxygenNote: 5.0.1 @@ -35,7 +33,8 @@ Collate: 'RcppExports.R' 'ccRecord.R' 'ccTable.R' - 'create2dclean.R' + 'cchic_xml.R' + 'data.R' 'data.quality.report.R' 'deltaTime.R' 'demographics.R' @@ -43,13 +42,6 @@ Collate: 'filter.missingness.R' 'filter.range.R' 'imputation.R' - 'pipeline.R' - 'reallocateTime.R' - 'selectTable.R' - 'sql_demographic.R' 'stdid.R' - 'summary.R' - 'unique.spell.R' 'utilities.R' - 'xml2ccdata.R' 'zzz.R' diff --git a/NAMESPACE b/NAMESPACE index f7a0b1c..d7112ba 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,42 +3,37 @@ export(StdId) export(as.number) export(ccRecord) -export(ccRecord_subset_files) +export(ccd_demographic_spell) +export(ccd_demographic_table) +export(ccd_unique_spell) export(code2stname) -export(create.cctable) export(create2dclean) +export(create_cctable) export(data.quality.report) export(data.quality.report.brc) export(deltaTime) export(demg.distribution) export(demographic.data.completeness) -export(demographic.patient.spell) -export(episode.graph) -export(extractInfo) +export(extract_info) export(file.summary) export(for_each_episode) export(getEpisodePeriod) -export(getfilter) export(icnarc2diagnosis) export(is.demographic) export(is.drugs) export(is.laboratory) export(is.physiology) export(lenstay) +export(long2stname) export(lookup.items) export(new.episode) export(physio.distribution) -export(reallocateTimeRecord) export(samplerate2d) -export(selectTable) export(site.info) -export(sql.demographic.table) export(stname2code) export(stname2longname) export(table1) export(total.data.point) -export(unique_spell) -export(update_database) export(which.classification) export(xml.file.duration.plot) export(xml.site.duration.plot) @@ -47,16 +42,36 @@ export(xmlTime2POSIX) exportClasses(ccEpisode) exportClasses(ccRecord) exportClasses(ccTable) +exportMethods("+") exportMethods("[") exportMethods("[[") -import(Rcpp) +exportMethods(plot) +exportMethods(subset) import(XML) import(data.table) -import(ggplot2) -import(knitr) import(methods) -import(pander) -import(parallel) import(yaml) importFrom(Rcpp,evalCpp) +importFrom(ggplot2,aes_string) +importFrom(ggplot2,annotate) +importFrom(ggplot2,element_blank) +importFrom(ggplot2,element_text) +importFrom(ggplot2,facet_grid) +importFrom(ggplot2,facet_wrap) +importFrom(ggplot2,geom_density) +importFrom(ggplot2,geom_line) +importFrom(ggplot2,geom_point) +importFrom(ggplot2,geom_segment) +importFrom(ggplot2,geom_vline) +importFrom(ggplot2,ggplot) +importFrom(ggplot2,ggtitle) +importFrom(ggplot2,labs) +importFrom(ggplot2,scale_colour_manual) +importFrom(ggplot2,scale_x_datetime) +importFrom(ggplot2,theme) +importFrom(ggplot2,xlab) +importFrom(ggplot2,ylab) +importFrom(knitr,knit) +importFrom(pander,pander) +importFrom(pander,panderOptions) useDynLib(cleanEHR) diff --git a/R/RcppExports.R b/R/RcppExports.R index b885a80..a73a77e 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -2,6 +2,6 @@ # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 reallocateTime_ <- function(d, t_discharge, frequency) { - .Call('cleanEHR_reallocateTime_', PACKAGE = 'cleanEHR', d, t_discharge, frequency) + .Call('_cleanEHR_reallocateTime_', PACKAGE = 'cleanEHR', d, t_discharge, frequency) } diff --git a/R/ccRecord.R b/R/ccRecord.R index 7d29bf6..c1d5132 100644 --- a/R/ccRecord.R +++ b/R/ccRecord.R @@ -1,14 +1,14 @@ #' @import XML #' @import data.table #' @import yaml -#' @import Rcpp +#' @importFrom Rcpp evalCpp NULL -#' The S3 class which holds all the CCHIC patient record - served as a database. +#' The S4 class which holds all the CCHIC patient record - served as a database. #' -#' @description ccRecord is a class to hold the raw episode data parsed directly from XML or -#' CSV files. +#' @description ccRecord is a class to hold the raw episode data parsed directly +#' from XML or CSV files. #' @field nepisodes is an integer number indicates the total number of episode #' the record is holding. #' @field dmgtb a data.table containing all the demographic information of each @@ -47,7 +47,7 @@ ccRecord <- setClass("ccRecord", infotb=data.table(), dmgtb=data.table())) -#' The S3 class which holds data of a single episode. +#' The S4 class which holds data of a single episode. #' #' @field site_id character string. Site ID, if presented, otherwise "NA". #' @field episode_id character string. Episode ID, if presented, otherwise "NA". @@ -80,79 +80,6 @@ ccEpisode <- setClass("ccEpisode", parse_time=as.POSIXct(NA), data=list())) - -#' Adding one ccEpisode object to ccRecord object. -#' -#' @param rec ccRecord-class -#' @param episode ccEpisode-class -#' @return ccRecord object -add.episode.to.record <- function(rec, episode) { - rec@episodes[[length(rec@episodes) + 1]] <- episode - index.record(rec) -} - -#' Adding a list of ccEpisode to ccRecord -#' -#' @description Adding a list of one or multiple ccEpisode objects to a -#' ccRecord object, the information table (infotb) will be updated automatically. -#' It is the more efficient way to add multiple ccEpisode objects. See -#' add.episode.to.record() for just adding one ccEpisode. -#' @param rec ccRecord -#' @param lst a list of ccEpisode objects -#' @return ccRecord -add.episode.list.to.record <- function(rec, lst) { - for(i in seq(length(lst))) - rec@episodes[[length(rec@episodes) + 1]] <- lst[[i]] - index.record(rec) -} - -#' Combine two ccRecord objects -#' -#' Combine two ccRecord objects and re-calculate the infortb -#' -#' @param rec1 ccRecord object -#' @param rec2 ccRecord object -#' @return ccRecord object -add.record.to.record <- function(rec1, rec2) { - rec1@episodes <- append(rec1@episodes, rec2@episodes) - index.record(rec1) -} - - -#' Adding a list of ccEpisode objects to a ccRecord -#' -#' @param e1 ccRecord-class -#' @param e2 A list of ccEpisode objects -#' @return ccRecord-class -setMethod('+', c("ccRecord", "list"), - function(e1, e2) {add.episode.list.to.record(e1, e2)} - ) - -#' Adding one ccEpisode object to a ccRecord -#' -#' @param e1 ccRecord-class -#' @param e2 ccEpisode-class -#' @return ccRecord-class -setMethod('+', c("ccRecord", "ccEpisode"), - function(e1, e2) {add.episode.to.record(e1, e2)}) - -#' Combine two ccRecord objects -#' -#' @param e1 ccRecord-class -#' @param e2 ccRecord-class -#' @return ccRecord-class -setMethod('+', c("ccRecord", "ccRecord"), - function(e1, e2) {add.record.to.record(e1, e2)} - ) - -#' Adding nothing to a ccRecord object. -#' -#' @param e1 ccRecord-class -#' @param e2 NULL -setMethod('+', c("ccRecord", "NULL"), - function(e1, e2) return(e1)) - - index.record <- function(rec) { retrieve_all <- function(x) { .simple.data.frame(list(site_id = x@site_id, @@ -185,6 +112,53 @@ index.record <- function(rec) { rec } +#' Adding a list of ccEpisode to ccRecord +#' +#' @description Adding a list of one or multiple ccEpisode objects to a +#' ccRecord object, the information table (infotb) will be updated automatically. +#' It is the more efficient way to add multiple ccEpisode objects. +#' @param e1 ccRecord +#' @param e2 a list of ccEpisode objects +#' @return ccRecord +#' @exportMethod + +setMethod('+', c("ccRecord", "list"), + function(e1, e2) { + for(i in seq(length(e2))) + e1@episodes[[length(e1@episodes) + 1]] <- e2[[i]] + index.record(e1) + + + }) + +#' Adding one ccEpisode object to a ccRecord +#' +#' @param e1 ccRecord-class +#' @param e2 ccEpisode-class +#' @return ccRecord-class +setMethod('+', c("ccRecord", "ccEpisode"), + function(e1, e2) { + e1@episodes[[length(e1@episodes) + 1]] <- e2 + index.record(e1) + }) + +#' Combine two ccRecord objects +#' +#' @param e1 ccRecord-class +#' @param e2 ccRecord-class +#' @return ccRecord-class +setMethod('+', c("ccRecord", "ccRecord"), + function(e1, e2) { + e1@episodes <- append(e1@episodes, e2@episodes) + index.record(e1) + }) + +#' Adding nothing to a ccRecord object and return the original ccRecord +#' +#' @param e1 ccRecord-class +#' @param e2 NULL +setMethod('+', c("ccRecord", "NULL"), + function(e1, e2) return(e1)) + #' Create a new episode #' @@ -202,7 +176,9 @@ index.record <- function(rec) { #' new.episode(eps) #' #' @export -new.episode <- function(lt=list(), parse_file="NA", parse_time=as.POSIXct(NA)) { +new.episode <- function(lt=list(), + parse_file="NA", + parse_time=as.POSIXct(NA)) { eps <- ccEpisode() eps@data <- lt @@ -219,7 +195,7 @@ new.episode <- function(lt=list(), parse_file="NA", parse_time=as.POSIXct(NA)) { slot.name <- c("t_admission", "t_discharge") for (i in seq(slot.name)) slot(eps, slot.name[i]) <- - as.POSIXct(xmlTime2POSIX(lt[[stname2code(short.name[i])]], allow=T)) + as.POSIXct(xmlTime2POSIX(lt[[stname2code(short.name[i])]], allow=TRUE)) eps@parse_file <- parse_file eps@parse_time <- parse_time @@ -236,7 +212,7 @@ for_each_episode <- function(record, fun) { } -#' Subseting a ccRecord object and return a list of ccEpisode objects. +#' Subsetting a ccRecord object and return a list of ccEpisode objects. #' #' @param x ccRecord-class #' @param i integer vector @@ -265,7 +241,7 @@ setMethod("[", "ccRecord", ccRecord() + eplst }) -#' Create a ccRecord subset via selected sites. +#' Create a ccRecord subsetting via selected sites. #' #' @param x ccRecord-class #' @param i character vector which contains site_ids, e.g. c("Q70", "Q70W") @@ -284,21 +260,139 @@ setMethod("[", signature(x="ccRecord", i="character"), ccRecord() + eplst }) +#' Get a subset of episodes from ccRecord. +#' +#' @param r ccRecord-class +#' @param f character a vector of XML file names - see ccRecord: parse_file +#' @return ccRecord-class +#' @exportMethod subset +setGeneric("subset", function(r, f) { + standardGeneric("subset") +}) + -#' Subset episodes from the specified XML files. +#' Get a subset of episodes that have the same from ccRecord . #' -#' @param ccd ccRecord object -#' @param files character a vector of XML file names - see ccRecord: parse_file +#' @param r ccRecord-class +#' @param f character a vector of XML file names - see ccRecord: parse_file #' @return ccRecord object -#' @export ccRecord_subset_files -ccRecord_subset_files <- function(ccd, files) { - ind <- ccd@infotb[ccd@infotb$parse_file %in% files]$index +setMethod("subset", signature(r="ccRecord", f="character"), +function(r, f) { + ind <- r@infotb[r@infotb$parse_file %in% f]$index if (length(ind) == 0) { return(ccRecord()) } eplst <- list() for (ep in ind) { - eplst[[length(eplst) + 1]] <- ccd@episodes[[ep]] + eplst[[length(eplst) + 1]] <- r@episodes[[ep]] } ccRecord() + eplst +}) + +episode_graph <- function(ep, items=NULL) { + t_ad <- ep@t_admission + t_dc <- ep@t_discharge + + + if (is.null(items)) + items <- c("h_rate", "spo2", "bilirubin", "platelets", "pao2_fio2", "gcs_total") + + all.drugs <- names(which(class.dict_code[names(ITEM_REF)] == "Drugs")) + used.drugs <- code2stname(all.drugs[all.drugs %in% names(ep@data)]) + + classification.dictionary <- sapply(ITEM_REF, function(x) x$Classification1) + + + create.long.table <- function(ep, items) { + items <- data.table(items=items, + code=stname2code(items), + longname=stname2longname(items), + class=classification.dictionary[stname2code(items)]) + units <- unit.dict[items$code] + units[is.na(units)] <- "" + items$longname <- paste0(items$longname, "\n", units) + + ltb <- list() + for (i in seq(nrow(items))) { + if (is.null(ep@data[[items[i]$code]])) + ltb[[i]] <- data.frame() + else + ltb[[i]] <- data.frame(ep@data[[items[i]$code]], + item=items[i]$longname) + } + ltb <- rbindlist(ltb, use.names=TRUE, fill=TRUE) + if (is.numeric(ltb$time)) + ltb$time <- t_ad + ltb$time * 60 * 60 + ltb$item2d <- as.numeric(ltb$item2d) + return(ltb) + } + + physio.tb <- create.long.table(ep, items) + physio.tb <- data.frame(physio.tb, + catg1=physio.tb$item, + catg2="Physiology Data") + drug.tb <- create.long.table(ep, used.drugs) + + drug.tb <- data.frame(drug.tb, catg1="Drugs", + catg2=drug.tb$item) + + + tb <- rbindlist(list(physio.tb, drug.tb), fill=TRUE, use.names=TRUE) + + + ggp <- ggplot(tb, aes_string(x="time", y="item2d", group="item", + colour="catg2")) + geom_line(colour="#1E506C") + + geom_point(size=1) + + facet_grid(catg1 ~., scales="free_y") + + geom_vline(xintercept = as.numeric(t_ad), colour="#D1746F") + + geom_vline(xintercept = as.numeric(t_dc), colour="#D1746F") + + scale_colour_manual(values=c("#1E506C", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#FFFFFF"), + name=paste0(ep@episode_id, "_", ep@site_id, "\n", + icnarc2diagnosis(ep@data[[stname2code('RAICU1')]]), "\n\n")) + + theme(legend.title = element_text(size=8), + legend.text = element_text(size=8)) + + labs(x="", y="") + + + + graphics::plot(ggp) + #"#1E506C""#D1746F" + invisible(tb) } + +#' Individual episode chart +#' +#' Create an individual episode chart for its diagnosis, drugs and physiological +#' variables. Diagnosis and drugs are always included, while the user can +#' select other longitudinal data. +#' @param r ccEpisode-class +#' @param v short name of longitudinal data. While v is not given, the chart +#' will only display h_rate, spo2, bilirubin, platelets, pao2_fio2, gcs_total. +#' @return a table of selected vars of an episode +#' @exportMethod plot +#' @examples +#' \dontrun{ +#' plot(ccd@episodes[[1]]) # plot first episode with default variables. +#' plot(ccd@episodes[[1]], "h_rate") # plot first episode heart rate +#' } +setGeneric("plot", function(r, v) { + standardGeneric("plot") +}) + +#' Episode chart +#' +#' @param r ccEpisode-class +#' @param v character +setMethod("plot", signature(r="ccEpisode", v="character"), +function(r, v){ + episode_graph(r, v) +}) + + +#' Episode chart default fields +#' +#' @param r ccEpisode-class +setMethod("plot", signature(r="ccEpisode", v="missing"), +function(r) { + episode_graph(r) +}) diff --git a/R/ccTable.R b/R/ccTable.R index c56b5aa..8fa20d8 100644 --- a/R/ccTable.R +++ b/R/ccTable.R @@ -1,70 +1,64 @@ -#' Rearrange and clean the critical care record into a 2D table. -#' -#' Data rearranging and major data cleaning processes will be performed under -#' the ccTable structre. It holds the original record (ccRecord), the dirty table -#' (torigin) clean table (tclean) and various data quality information (dquality). -#' Various data filters can also be found within the ccTable class. +#' Process the EHR data in table format #' +#' ccRecord data are re-arranged into tables where the columns stands for +#' data fields (e.g. heart rate, blood pressure) and the rows stands for +#' each data record within a unique cadence. See ccTable_create_cctable. +#' ccTable is the data processing platform. It stores both original data +#' and processed data alongside with the process details. It also contains +#' various commonly used data filters. #' @field record ccRecord. #' @field conf the YAML style configuration. -#' @field torigin the original wide data table. -#' @field tclean the wide data table after cleaning processes. +#' @field torigin the original data table. +#' @field tclean the data table after cleaning processes. #' @field dfilter list contains data filtering information. -#' @field dquality list contains data quality. +#' @field dquality list contains data quality. #' @field summary list #' @field base_cadence the base cadence is specified in hours #' @include ccRecord.R #' @examples #' rec <- ccRecord() -#' cctable <- create.cctable(rec, freq=1) +#' cctable <- create_cctable(rec, freq=1) #' cctable <- cctable$clean() #' #table <- cctable$tclean #' @exportClass ccTable ccTable <- setRefClass("ccTable", - fields=c( - record="ccRecord", - conf="list", - torigin="data.table", - tclean="data.table", - dfilter="list", - dquality="list", - summary="list", - base_cadence="numeric", - .rindex="data.table", - .epindex="data.table", - items="character")) + fields=c( + record="ccRecord", + conf="list", + torigin="data.table", + tclean="data.table", + dfilter="list", + dquality="list", + summary="list", + base_cadence="numeric", + .rindex="data.table", + .epindex="data.table", + items="character")) ccTable$methods( -show = function() { -# panderOptions("table.split.table", 150) - - cat("$tclean", "\n") - print(.self$tclean) - cat("Data entry (origin) = ", nrow(.self$torigin), "\n") - uniepisode <- .self$torigin[,1,by=c("episode_id", "site")] - cat("Episode number (origin) = ", nrow(uniepisode), "\n") - - -# if (!is.null(.self$tclean) & nrow(.self$tclean) != 0) { -# uniepisode <- .self$tclean[,1,by=c("episode_id", "site")] -# cat("Data entry (clean) = ", nrow(.self$tclean), "\n") -# uniepisode <- .self$tclean[,1,by=c("episode_id", "site")] -# cat("Episode number (clean) = ", nrow(uniepisode), "\n") -# .self$dfilter[['#missingness']].show() -# } -# else -# cat("no cleaning# data can be found.\n") - - -}) - -#' construct function of ccTable object + show = function() { + cat("$tclean", "\n") + #print(.self$tclean) + cat("Data entry (origin) = ", nrow(.self$torigin), "\n") + uniepisode <- .self$torigin[,1,by=c("episode_id", "site")] + cat("Episode number (origin) = ", nrow(uniepisode), "\n") + cat("The base cadence is ", .self$base_cadence, " hour.\n") + }) + +#' Create a ccTable object +#' +#' Re-arrange the ccRecord object to table format where each column stands +#' for a variable and each row a record data point. The number of rows will +#' depend on the sampling frequency set in this function. If the original data +#' has a higher recording frequency than the set frequency (freq), the closest +#' data point will be taken. It is suggested the `freq` should not be set +#' lower than the maximum recording frequency in the original dataset. #' @param rec ccRecord #' @param conf either the path of YAML configuration file or the configuration -#' structure in list. -#' @param freq the data cadence in hour. -#' @return ccTable object -#' @export create.cctable -create.cctable <- function(rec, freq, conf=NULL) { +#' @param freq a unique sampling frequency (in hours) for all variables. e.g. if freq is set to +#' 1, each row in ccTable will represent a record of one hour. +#' @return ccTable +#' @export +create_cctable <- function(rec, conf=NULL, freq=1) { if (is.null(conf)) conf <- ITEM_REF else { @@ -73,155 +67,439 @@ create.cctable <- function(rec, freq, conf=NULL) { } cct <- ccTable(record=rec, conf=conf) - cct$create.table(freq) + cct$create_table(freq) return(cct) } -#' get the dfilter -#' @param dq can be either dqaulity table or torigin -#' @param criterion should be a function to give T/F values of each entry. -#' @export getfilter -getfilter <- function(dq, criterion) { - if (ncol(dq) > 2) { - keys <- dq[, c("site", "episode_id"), with=FALSE] - dq[, c("site", "episode_id"):=NULL] - # updating range entry with true/false values - dq <- dq[, Map(criterion, .SD, names(.SD))] - # adding site and episode_id columns. - entry <- data.table(keys, dq) - episode <- entry[, - all(unlist(.SD), na.rm=TRUE), - by=c("site", "episode_id")] - setnames(episode, c("site", "episode_id", "select_index")) - return(list(entry=entry, episode=episode)) - } - else return(NULL) -} +#' Create a ccTable object +#' +#' This is a member function of ccTable-class. Using create_cctable is a safer and +#' easier way to create the ccTable. See create_cctable. +#' @name ccTable_create_cctable +NULL +ccTable$methods( + create_table = function(freq){ + "Create a table contains the selected items in the conf with a given + frequency (in hour)" + .self$items <- names(.self$conf) + .self$torigin <- ccd_select_table(record=record, items_opt=items, freq=freq) + if (nrow(.self$torigin) != 0) { + setkey(.self$torigin, "site", "episode_id") + .self$tclean <- .self$torigin + setkey(.self$torigin, "site", "episode_id") + .self$base_cadence <- freq + .self$.rindex <- .self$torigin + for(i in .self$items) .self$.rindex[[i]] <- TRUE + + .self$.epindex <- .self$torigin[, TRUE, by=c("site", "episode_id")] + setnames(.self$.epindex, c("site", "episode_id", "index")) + } else + .self$torigin <- data.table(site=character(), + episode_id=character(), + time=integer()) + }) ccTable$methods( - create.table = function(freq){ - "Create a table contains the selected items in the conf with a given - frequency (in hour)" - .self$items <- names(.self$conf) - .self$torigin <- selectTable(record=record, items_opt=items, freq=freq) - if (nrow(.self$torigin) != 0) { - setkey(.self$torigin, "site", "episode_id") - .self$tclean <- .self$torigin - setkey(.self$torigin, "site", "episode_id") - .self$base_cadence <- freq - - .self$.rindex <- .self$torigin - for(i in .self$items) .self$.rindex[[i]] <- TRUE - - .self$.epindex <- .self$torigin[, TRUE, by=c("site", "episode_id")] - setnames(.self$.epindex, c("site", "episode_id", "index")) - } else - .self$torigin <- data.table(site=character(), - episode_id=character(), - time=integer()) -}) + update.entry = function(){ + for (i in .self$items) + .self$tclean[[i]][!.self$.rindex[[i]]] <- NA + }) +ccTable$methods( + update.episode = function(){ + sep <- .self$.epindex[index==TRUE] + .self$tclean <- merge(.self$tclean, sep, by=c("site", "episode_id")) + .self$tclean[["index"]] <- NULL + }) +#' Apply all the setup filters. +#' +#' Once filters are applied, the processed data will be stored in tclean. Note, +#' running filtering function before apply_filters is necessary. This function +#' will have no effect on tclean if no filter is ran prior. +#' Filters will decide to preserve or remove particular entries or episodes. +#' @param warnings logical value to indicate more or less messages with an +#' default value TRUE. +#' @name ccTable_apply_filters +#' @examples +#' \dontrun{ +#' tb <- create_cctable(ccd, conf, 1) +#' tb$range_filter() +#' tb$apply_filter() # apply only the range filter ragardless of the conf. +#' } +NULL ccTable$methods( - update.entry = function(){ - for (i in .self$items) - .self$tclean[[i]][!.self$.rindex[[i]]] <- NA - }) + apply_filters = function(warnings=TRUE) { + "Apply all filters specified in the configuration to update the clean + table (tclean)" + spec2function <- function(item.name, filter.name) { + spec <- .self$conf[[item.name]][[filter.name]]$apply + spec <- as.character(as.vector(spec)) + switch(spec, + "drop_entry"=.self$drop_entry, + "drop_episode"=.self$drop_episode, + "NA"=function(nmitem, dq){}, + "NULL"=function(nmitem, dq){}, + stop("functions for applying filters can only be 'drop_entry' or 'drop_episode'. ")) + } + + ops <- strsplit(grep('apply', names(unlist(.self$conf)), value=TRUE), "[.]") + for (i in ops) { + item <- i[1] + filter <- i[2] + tryCatch(spec2function(item, filter)(item, + .self$dfilter[[filter]]), + error = function(e) { + if (is.null(.self$dfilter[[filter]])) { + if (warnings) + warning(paste(item, "filter", filter, + "has been specified in the configuration but has not been ran.")) + } + else { + cat(paste(item, filter, "\n")) + stop(e) + } + }) + } + .self$update.entry() + .self$update.episode() + }) ccTable$methods( - update.episode = function(){ - sep <- .self$.epindex[index==TRUE] - .self$tclean <- merge(.self$tclean, sep, by=c("site", "episode_id")) - .self$tclean[["index"]] <- NULL - }) + drop_entry = function(nmitem, dq){ + .self$.rindex[[nmitem]] <- + .self$.rindex[[nmitem]] & dq$entry[[nmitem]] + }) ccTable$methods( - apply.filters = function(warnings=T) { - "Apply all filters specified in the configuration to update the clean - table (tclean)" - ops <- strsplit(grep('apply', names(unlist(.self$conf)), value=TRUE), "[.]") - for (i in ops) { - item <- i[1] - filter <- i[2] - tryCatch(.self$spec2function(item, filter)(item, - .self$dfilter[[filter]]), - error = function(e) { - if (is.null(.self$dfilter[[filter]])) { - if (warnings) - warning(paste(item, "filter", filter, - "has been specified in the configuration but has not been ran.")) - } - else { - cat(paste(item, filter, "\n")) - stop(e) - } - } - ) - } - .self$update.entry() - .self$update.episode() - }) + drop_episode = function(nmitem, dq){ + .self$.epindex[["index"]] <- + .self$.epindex[["index"]] & dq$episode[["select_index"]] + }) +#' Reload the YAML configuration file +#' +#' Note, this function will also reset all the operations and +#' remove the tclean. +#' @name ccTable_reload_conf +#' @param conf full path of the YAML configuration file or the parsed config list. +#' @examples +#' \dontrun{ +#' tb$reload_conf("REF.yaml") +#' } +NULL ccTable$methods( - drop_entry = function(nmitem, dq){ - .self$.rindex[[nmitem]] <- - .self$.rindex[[nmitem]] & dq$entry[[nmitem]] - }) + reload_conf = function(conf) { + "reload yaml configuration." + if (is.character(conf)) + .self$conf <- yaml.load_file(conf) + if (!is.list(conf)) + stop("conf must be a list or the full path to a YAML file.") + else + .self$conf <- conf + .self$reset() + }) +#' Reset the ccTable +#' +#' Restore the object to its initial status. All the filters, quality and the +#' cleaned table will be removed. +#' @name ccTable_reset +NULL ccTable$methods( - drop_episode = function(nmitem, dq){ - .self$.epindex[["index"]] <- - .self$.epindex[["index"]] & dq$episode[["select_index"]] - }) + reset = function() { + .self$dfilter <- list() + .self$dquality <- list() + .self$tclean <- .self$torigin + }) +#' Export the clean table as a CSV file +#' +#' Export tclean as a CSV file. +#' @name ccTable_export_csv +#' @param file the full path of the output CSV file. +NULL ccTable$methods( - spec2function = function(item.name, filter.name) { - spec <- .self$conf[[item.name]][[filter.name]]$apply - spec <- as.character(as.vector(spec)) - switch(spec, - "drop_entry"=.self$drop_entry, - "drop_episode"=.self$drop_episode, - "NA"=function(nmitem, dq){}, - "NULL"=function(nmitem, dq){}, - stop("functions for applying filters can only be 'drop_entry' or 'drop_episode'. ")) -}) + export_csv = function(file=NULL) { + "Export the cleaned table to a CSV file." + if (is.null(file)) + return(.self$tclean) + write.csv(.self$tclean, file=file) + }) +#' Apply all the filters +#' +#' All the filters in configuration will be applied to create the +#' clean dataset. The filters include range, categories, missingness, +#' no_data. +#' @name ccTable_clean +#' @examples +#' \dontrun{ +#' tb <- create_cctable(ccd, conf, 1) +#' tb$clean() +#' } +NULL ccTable$methods( - filter.null = function(items=c("episode_id", "site")) { - "remove the entire episode when any of the selected items is NULL" - for (i in items) - .self$tclean <- .self.tclean[i != "NULL"] -}) + clean = function() { + if (nrow(.self$torigin) != 0 ) { + .self$filter_range() + .self$filter_categories() + .self$filter_missingness() + .self$filter_nodata() + .self$apply_filters() + } + else + warning("The original table is NULL, hence no cleaning process has been performed.") + }) -ccTable$methods( - reload.conf = function(file) { - "reload yaml configuration." - .self$conf=yaml.load_file(file) -}) +itemsToDataFrame <- function(ep, items, period_length, freq) { + listmatrix <- list() + time <- seq(0, period_length, freq) + listmatrix[["time"]] <- time -ccTable$methods( - export.csv = function(file=NULL) { - "Export the cleaned table to a CSV file." - if (is.null(file)) - return(.self$tclean) + for (i in items) { + if (length(ep@data[[i]]) == 1) { + listmatrix[[i]] <- rep(ep@data[[i]], length(time)) + } + else { + if ("time" %in% names(ep@data[[i]])) { + new <- reallocateTime(ep@data[[i]], period_length, freq) + listmatrix[[i]] <- new$val + if ("meta" %in% names(ep@data[[i]])) + listmatrix[[paste(i, "meta", sep=".")]] <- new$meta + } + else + listmatrix[[i]] <- rep("NA", length(time)) + } + } + return(listmatrix) +} - write.csv(.self$tclean, file=file) -}) +#' Create the table for ccTable from ccRecord +#' +#' @param record ccRecord +#' @param items_opt character vectors. Items (HIC code) selected in item_opt are optional items, which will be automatically +#' filled when item is missing. +#' @param items_obg obligatory items that is obligatory; Any episode that does not contain +#' item in this vector will be removed. +#' @param freq numeric cadence in hour. +#' @param return_list logical if TRUE return as a list. +#' @return data.table +ccd_select_table <- function(record, items_opt=NULL, items_obg=NULL, freq, + return_list=FALSE) { + all_items <- c(items_opt, items_obg) + if (is.null(all_items)) + stop('both items_opt and items_obg are NULL') -ccTable$methods( - clean = function() { - if (nrow(.self$torigin) != 0 ) { - .self$filter.ranges() - .self$filter.category() - .self$filter.missingess() - .self$filter.nodata() - .self$apply.filters() + env <- environment() + lt <- list() + stopifnot(is.list(env$lt)) # totally redundent, just to avoid an anonying + # note says env is assigned but not used! + for_each_episode(record, + function(ep) { + if (all(items_obg %in% names(ep@data))) { + result <- list() + period_length <- getEpisodePeriod(ep) + # getEpisodePeriod will return NULL when no 2D + # data been found. + if (!is.null(period_length)) { + if (period_length > 0 ) { + result <- append(result, + itemsToDataFrame(ep, all_items, + period_length, + freq)) + nlength <- length(result[["time"]]) + result[["site"]] <- rep(ep@site_id, nlength) + result[["episode_id"]] <- rep(ep@episode_id, nlength) + env$lt[[length(lt) + 1]]<- .simple.data.frame(result) + } + } + } + }) + if (return_list) + return(lt) + + # fill is true because meta data column can be missing. + dt <- rbindlist(lt, fill=TRUE) + + + # Adding missing meta columns to keep the 2d wide consistent. + code.has.meta <- names(unlist(sapply(ITEM_REF, function(x) x$NHICmetaCode))) + for (i in all_items) { + meta.code <- paste(i, "meta", sep=".") + if (i %in% code.has.meta & !(meta.code %in% names(dt))) { + dt[[meta.code]] <- rep("NA", nrow(dt)) } - else - warning("The original table is NULL, hence no cleaning process has been performed.") - }) + } + + # convert data type + for (i in all_items) + dt[[i]] <- suppressWarnings(.which.datatype(i)(as.character(dt[[i]]))) + + + return(dt) +} + +#' Clean table - low memory +#' +#' The cleaning process is specified by the YAML configuration. All the filters +#' presented in the configuration will be applied. It returns only the cleaned +#' data. However all the data quality information will be lost. This function +#' is useful when the memory is not sufficiently enough to hold all the +#' information. +#' @param record ccRecord +#' @param config the full path of the YAML configuration file +#' @param freq table cadence +#' @param nchunks integer number. The larger the nchunks the less memory +#' requirement. +#' @return A cleaned 2d wide table +#' @export create2dclean +create2dclean <- function(record, config, freq=1, nchunks=1) { + .create2dclean <- function(record, config, freq) { + dt.sofa <- create_cctable(rec=record, conf=config, freq=freq) + dt.sofa$filter_range() + dt.sofa$filter_categories() + dt.sofa$filter_missingness() + dt.sofa$filter.nodata() + dt.sofa$apply_filters() + return(dt.sofa) + } + + + if (is.character(config)) + config <- yaml.load_file(config) + + stopifnot(nchunks > 0 & nchunks < record@nepisodes) + + if (nchunks == 1) + return(.create2dclean(record, config, freq)$tclean) + + op.seq <- round(seq(1, record@nepisodes + 1, length.out=nchunks + 1)) + + tclean <- list() + + for (i in seq(length(op.seq) - 1)) { + rc <- record[seq(op.seq[i], op.seq[i+1] - 1)] + tclean[[i]] <- .create2dclean(rc, config, freq)$tclean + gc() + } + + tclean <- rbindlist(tclean, fill=TRUE) + return(tclean) +} + + + +#' @importFrom Rcpp evalCpp +#' @useDynLib cleanEHR +reallocateTime <- function(d, t_discharge, frequency) { + d_ <- d + stopifnot(any(names(d) == "time")) + stopifnot(any(names(d) == "item2d")) + stopifnot(class(d$time) == "numeric") + return(reallocateTime_(d_, t_discharge, frequency)) +} + + +findMaxTime <- function(episode) { + get2dTime <- function(episode){ + time_lt <- + lapply(episode@data, + function(item){ + if(length(item) > 1) { + if (!is.numeric(item$time)) + item$time <- + as.numeric(as.character(item$time)) + return(max(item$time)) + } + }) + tm <- unlist(time_lt) + tm + } + tm <- get2dTime(episode) + if (is.null(tm)) + return(NULL) + else + return(max(tm)) +} + + +#' Get the length of stay based on the first and the last data point. +#' +#' @param e ccEpisode object. +#' @param unit character string. Units in which the results are desired. Can be abbreviated. +#' @return length of stay +#' @export getEpisodePeriod +getEpisodePeriod <- function (e, unit="hours") { + # pseudo delta period, see addPseudoTime() + if (class(e@t_discharge)[1] == "numeric") + return(e@t_discharge) + + if (class(e@t_admission)[1] != "POSIXct") + tadm <- xmlTime2POSIX(as.character(e@t_admission), allow=TRUE) + else + tadm <- e@t_admission + if (class(e@t_discharge)[1] != "POSIXct") + tdisc <- xmlTime2POSIX(as.character(e@t_discharge), allow=TRUE) + else + tdisc <- e@t_discharge + + # The failure of POSIX conversion indicates that this episode is either + # anonymised or has a missing or incorrect value of discharge or admission + # time. + if (is.na(tadm) || is.na(tdisc)) + period_length <- findMaxTime(e) + else { + if (any(is.null(tdisc), is.null(tadm))) + period_length <- NULL + else + period_length <- as.numeric(tdisc - tadm, + units=unit) + } + # in cases that tdisc == tadm + if (!is.null(period_length)) { + if (period_length == 0) + period_length <- period_length + 1 + } + + if (is.null(period_length)) + warning("This episode does not have any time series data: ", + " episode_id = ", e@episode_id, + " nhs_number = ", e@nhs_number, + " pas_number = ", e@pas_number, + " period_length = ", period_length, "\n") + + + return(period_length) +} + +#' Propagate a numerical delta time interval record. +#' +#' @param record ccRecord +#' @param delta time frequency in hours +#' @details when discharge time and admission time are missing, the latest and +#' the earliest data time stamp will be used instead. +reallocateTimeRecord <- function(record, delta=0.5) { + reallocate.episode <- function(e) { + env <- environment() + # make sure admin and disc time is correct + period_length <- getEpisodePeriod(e) + if (period_length < 0) warning("period length < 0") + + # calling reallocateTime for each data item + new.episode(lapply(e@data, + function(d) { + if (length(d) > 1) { + return(reallocateTime(d, env$period_length, delta)) + } else + return(d) + })) + } + newdata <- for_each_episode(record, reallocate.episode) + return(ccRecord() + newdata) +} diff --git a/R/xml2ccdata.R b/R/cchic_xml.R similarity index 99% rename from R/xml2ccdata.R rename to R/cchic_xml.R index 045fdc8..a5d5fd4 100644 --- a/R/xml2ccdata.R +++ b/R/cchic_xml.R @@ -20,7 +20,7 @@ xmlEpisodeToList <- function(episode_node) { return() } - num.children = xmlSize(node) + num.children <- xmlSize(node) if(num.children == 0 ) { label <- xmlName(xmlParent(node)) if (is.null(.which.type(label))) { diff --git a/R/create2dclean.R b/R/create2dclean.R deleted file mode 100644 index fb831c6..0000000 --- a/R/create2dclean.R +++ /dev/null @@ -1,46 +0,0 @@ -.create2dclean <- function(record, config, freq) { - dt.sofa <- create.cctable(rec=record, conf=config, freq=freq) - dt.sofa$filter.ranges() - dt.sofa$filter.category() - dt.sofa$filter.missingness() - dt.sofa$filter.nodata() - dt.sofa$apply.filters() - return(dt.sofa) -} - -#' Create a 2D wide clean table - low memory -#' -#' The cleaning process is specified by the YAML configuration. All the filters -#' presented in the configuration will be applied. It returns only the cleaned -#' data. However all the data quality information will be lost. This function -#' is useful when the memory is not sufficiently enough to hold all the -#' information. -#' @param record ccRecord -#' @param config the full path of the YAML configuration file -#' @param freq table cadence -#' @param nchunks integer number. The larger the nchunks the less memory -#' requirement. -#' @return A cleaned 2d wide table -#' @export create2dclean -create2dclean <- function(record, config, freq=1, nchunks=1) { - if (is.character(config)) - config <- yaml.load_file(config) - - stopifnot(nchunks > 0 & nchunks < record@nepisodes) - - if (nchunks == 1) - return(.create2dclean(record, config, freq)$tclean) - - op.seq <- round(seq(1, record@nepisodes + 1, length.out=nchunks + 1)) - - tclean <- list() - - for (i in seq(length(op.seq) - 1)) { - rc <- record[seq(op.seq[i], op.seq[i+1] - 1)] - tclean[[i]] <- .create2dclean(rc, config, freq)$tclean - gc() - } - - tclean <- rbindlist(tclean, fill=TRUE) - return(tclean) -} diff --git a/R/data.R b/R/data.R new file mode 100644 index 0000000..81c6473 --- /dev/null +++ b/R/data.R @@ -0,0 +1,25 @@ +#' This a reference table of NHIC data items. +#' +#' @name data.checklist +#' @docType data +#' @author Sinan Shi \email{s.shi@ucl.ac.uk} +#' @keywords data +NULL + +#' ICNARC diagnosis reference table +#' +#' @name icnarc_table +#' @references \url{https://www.icnarc.org/Our-Audit/Audits/Cmp/Resources/Icm-Icnarc-Coding-Method} +#' @docType data +#' @keywords data +NULL + + +#' Field reference table +#' +#' @name ITEM_REF +#' @docType data +#' @keywords data +NULL + + diff --git a/R/data.quality.report.R b/R/data.quality.report.R index 545426f..182ec25 100644 --- a/R/data.quality.report.R +++ b/R/data.quality.report.R @@ -10,33 +10,33 @@ #' @param site a vector of the site ids for the site specified report. #' @param pdf logical create the pdf version of the DQ report, #' otherwise stay in markdown format -#' @param file charcter a list of XML file origins. +#' @param file character a list of XML file origins. #' @param out character output path #' @export data.quality.report +#' @importFrom pander pander panderOptions +#' @importFrom knitr knit #' @examples #' \dontrun{data.quality.report(ccd, c("Q70", "C90"))} -#' @import knitr -#' @import pander -#' @import ggplot2 -data.quality.report <- function(ccd, site=NULL, file=NULL, pdf=T, out="report") { +data.quality.report <- function(ccd, site=NULL, file=NULL, pdf=TRUE, out="report") { # if (is.null(site) & is.null(file)) dbfull <- "YES" # else dbfull <- "NO" stopifnot(!(!is.null(site) & !is.null(file))) if (!is.null(site)) ccd <- ccd[site] - if (!is.null(file)) ccd <- ccRecord_subset_files(ccd, file) + if (!is.null(file)) ccd <- subset(ccd, file) if (dir.exists(out)) { - unlink(out, recursive=T) + unlink(out, recursive=TRUE) } dir.create(out) wd <- getwd() rptpath <- paste(path.package('cleanEHR'), "report", sep="/") - file.copy(dir(rptpath, full.names=T), out, recursive=T) + file.copy(dir(rptpath, full.names=TRUE), out, recursive=TRUE) write.report <- function() { + on.exit(setwd(wd)) setwd(out) dqpath <- "data_quality_report.Rmd" headerpath <- "listings-setup.tex" @@ -81,7 +81,7 @@ data.quality.report <- function(ccd, site=NULL, file=NULL, pdf=T, out="report") #' Oxford, and UCLH. #' @param path report export path #' @export data.quality.report.brc -data.quality.report.brc <- function(ccd, pdf=T, brc=NULL, path=NULL) { +data.quality.report.brc <- function(ccd, pdf=TRUE, brc=NULL, path=NULL) { if (!is.null(path)) dir.create(path) @@ -120,13 +120,15 @@ file.summary <- function(ccd) { #' Plot the XML duration in terms of sites. #' #' @param ccd ccRecord-class +#' @importFrom ggplot2 ggplot aes_string geom_segment annotate scale_x_datetime theme ggtitle xlab ylab geom_line geom_point facet_wrap facet_grid geom_vline scale_colour_manual element_text labs +#' @importFrom ggplot2 geom_density element_blank #' @export xml.site.duration.plot xml.site.duration.plot <- function(ccd) { tb <- copy(ccd@infotb) - tb <- tb[, list("minadm"=min(.SD[["t_admission"]], na.rm=T), - maxadm=max(.SD[["t_admission"]], na.rm=T), - mindis=min(.SD[["t_discharge"]], na.rm=T), - maxdis=max(.SD[["t_discharge"]], na.rm=T)), by="site_id"] + tb <- tb[, list("minadm"=min(.SD[["t_admission"]], na.rm=TRUE), + maxadm=max(.SD[["t_admission"]], na.rm=TRUE), + mindis=min(.SD[["t_discharge"]], na.rm=TRUE), + maxdis=max(.SD[["t_discharge"]], na.rm=TRUE)), by="site_id"] site_name <- apply((site.info()[tb$site_id, ][,1:2]), 1, function(x) paste(x, collapse="-")) tb[, site_name:=site_name] @@ -147,10 +149,10 @@ xml.site.duration.plot <- function(ccd) { #' @export xml.file.duration.plot xml.file.duration.plot <- function(ccd) { tb <- copy(ccd@infotb) - tb <- tb[, list(minadm=min(.SD[["t_admission"]], na.rm=T), - maxadm=max(.SD[["t_admission"]], na.rm=T), - mindis=min(.SD[["t_discharge"]], na.rm=T), - maxdis=max(.SD[["t_discharge"]], na.rm=T)), by="parse_file"] + tb <- tb[, list(minadm=min(.SD[["t_admission"]], na.rm=TRUE), + maxadm=max(.SD[["t_admission"]], na.rm=TRUE), + mindis=min(.SD[["t_discharge"]], na.rm=TRUE), + maxdis=max(.SD[["t_discharge"]], na.rm=TRUE)), by="parse_file"] ggplot(tb, aes_string(x="minadm", y="parse_file")) + geom_segment(aes_string(xend="maxdis", yend="parse_file"), color="gray", size=10) + annotate("text", x=tb$minadm+(tb$maxdis-tb$minadm)/2, @@ -170,7 +172,7 @@ txt.color <- function(x, color) { #' Create a demographic completeness table (in pander table) #' -#' @param demg data.table the demographic data table created by sql.demographic.table() +#' @param demg data.table the demographic data table created by ccd_demographic_table() #' @param names short name of selected items #' @param return.data logical return the table if TRUE #' @export demographic.data.completeness @@ -198,7 +200,7 @@ demographic.data.completeness <- function(demg, names=NULL, return.data=FALSE) { demg <- copy(demg) demg[, "index":=NULL] if (!is.null(names)) - demg <- demg[, names, with=F] + demg <- demg[, names, with=FALSE] cmplt <- apply(demg, 2, function(x) length(which(!(x=="NULL" | is.na(x))))) cmplt <- data.frame(cmplt) @@ -234,7 +236,7 @@ demographic.data.completeness <- function(demg, names=NULL, return.data=FALSE) { if (return.data) return(cmplt) - pander(as.data.frame(cmplt), style="rmarkdown", justify = c('left', 'center', "center", + pander::pander(as.data.frame(cmplt), style="rmarkdown", justify = c('left', 'center', "center", "center", 'left')) } @@ -243,7 +245,7 @@ demographic.data.completeness <- function(demg, names=NULL, return.data=FALSE) { #' @param cctb ccTable-class, see create.cctable(). #' @export samplerate2d samplerate2d <- function(cctb) { - sample.rate.table <- data.frame(fix.empty.names=T) + sample.rate.table <- data.frame(fix.empty.names=TRUE) # items are the columns before site. items <- names(cctb)[-c(grep("meta", names(cctb)), which(names(cctb) %in% @@ -261,7 +263,7 @@ samplerate2d <- function(cctb) { names(sample.rate.table) <- c("Item", "NHIC Code (NIHR_HIC_ICU_xxxx)", "Sample Period (hour)") - pander(as.data.frame(sample.rate.table), style="rmarkdown") + pander::pander(as.data.frame(sample.rate.table), style="rmarkdown") } @@ -271,26 +273,28 @@ samplerate2d <- function(cctb) { #' @param ccd ccRecord-class #' @export total.data.point total.data.point <- function(ccd) { - dp.physio <- - sum(unlist(for_each_episode(ccd, - function(x) - Reduce(sum, sapply(x@data, nrow))))) - dp.demg <- - sum(unlist(for_each_episode(ccd, - function(x) - Reduce(sum, sapply(x@data, nrow))))) - return(sum(dp.physio, dp.demg)) + get.rows <- function(x) { + n <- nrow(x) + if (is.null(n)) + n <- 1 + return(n) + } + + sum(unlist(for_each_episode(ccd, + function(x){ + sum(vapply(x@data, get.rows, 1)) + }))) } #' Produce the item specified table one. #' -#' @param demg demographic table created by sql.demographic.table() +#' @param demg demographic table created by ccd_demographic_table() #' @param names character string. Short names of data items, e.g. h_rate. #' @param return.data logical, FALSE: printing the pander table, TRUE: return the table but not print out the pander table. #' @return if return.data is TRUE, return data.table #' @export table1 table1 <- function(demg, names, return.data=FALSE) { - panderOptions('knitr.auto.asis', FALSE) + pander::panderOptions('knitr.auto.asis', FALSE) if (!return.data) cat(paste("\n## Table ONE\n")) @@ -303,7 +307,7 @@ table1 <- function(demg, names, return.data=FALSE) { cat(paste("\n###", ref$dataItem," - ", hicnum, "\n")) if (ref$Datatype %in% c("text", "list", "Logical", "list / Logical")) { stopifnot(!is.null(ref$category)) - nmref <- sapply(ref$category$levels, function(x) x) + nmref <- vapply(ref$category$levels, function(x) x, character(1)) r <- demg[, .N, by=name] level.name <- nmref[r[[name]]] r[, "nm":=level.name] @@ -320,27 +324,27 @@ table1 <- function(demg, names, return.data=FALSE) { if (return.data) return(tb) else - pander(as.data.frame(tb), style="rmarkdown") + pander::pander(as.data.frame(tb), style="rmarkdown") } for (i in names) table1.item(demg, i) - panderOptions('knitr.auto.asis', TRUE) + pander::panderOptions('knitr.auto.asis', TRUE) } #' demg.distribution #' Create a plot of the distribution of numerical demographic data. #' -#' @param demg ccRecord or demographic table created by sql.demographic.table() +#' @param demg ccRecord or demographic table created by ccd_demographic_table() #' @param names character vector of short names of numerical demographic data. #' @examples #' \dontrun{tdemg.distribution(ccd, "HCM")} #' @export demg.distribution demg.distribution <- function(demg, names) { if (class(demg) == "ccRecord") - demg <- sql.demographic.table(demg) + demg <- ccd_demographic_table(demg) for (nm in names) { ref <- ITEM_REF[[stname2code(nm)]] hicnum <- as.number(StdId(stname2code(nm))) diff --git a/R/deltaTime.R b/R/deltaTime.R index be1f03e..1ebe441 100644 --- a/R/deltaTime.R +++ b/R/deltaTime.R @@ -23,8 +23,9 @@ find.episode.time <- function(episode) { -#' convert calendar time data in a record to delta time comparing to the ICU +#' Convert calendar date-time to the time difference comparing to the ICU #' admission time. +#' #' @param record ccRecord #' @param pseudotime logical If pseudotime is set to be TRUE, then the #' admission and discharge time will be set as the earliest and latest data stamp diff --git a/R/demographics.R b/R/demographics.R index c7b77e4..b65cc7c 100644 --- a/R/demographics.R +++ b/R/demographics.R @@ -1,16 +1,101 @@ +#' Create the demographic tables, which includes all non-time-varying variables. +#' +#' @description The data type of each column is in its corresponding data +#' type. +#' +#' @param record ccRecord-class +#' @param dtype logical column will be type aware, else all in character. +#' @export +ccd_demographic_table <- function(record, dtype=TRUE) { + env <- environment() + demogls <- list() + stopifnot(is.list(env$demogls)) + all.demcode <- all.nhic.code("Demographic") + for_each_episode(record, + function(x){ + demog.data <- rep("NULL", length(all.demcode)) + names(demog.data) <- all.demcode + demog.data <- as.list(demog.data) + for(item in names(x@data)) { + if (length(x@data[[item]]) == 1) { + demog.data[[item]] <- x@data[[item]] + } + } + env$demogls[[length(env$demogls) + 1]] <- .simple.data.frame(demog.data) + }) + demogt <- rbindlist(demogls, fill=TRUE) + setnames(demogt, code2stname(names(demogt))) + + if (dtype) { + for (i in seq(ncol(demogt))){ + demogt[[i]] <- + .which.datatype(stname2code(names(demogt)[i]))(demogt[[i]]) + } + } + demogt[, "index":=seq(nrow(demogt))] + return(demogt) +} + + #' Calculate the length of stay in the ICU. #' #' Calculate the length of stay in the ICU and append it to the original demographic #' table. -#' @param demg data.table the demograhic table which should at least contain +#' @param demg data.table the demographic table which should at least contain #' column DAICU and DDICU #' @param units character The unit of lenstay column, by default the output will be in hours #' @return data.table It is the original data.table with lenstay column (in #' difftime) appended. #' @export lenstay lenstay <- function(demg, units="hours") { - len <- difftime(xmlTime2POSIX(demg$DDICU, allow=T), - xmlTime2POSIX(demg$DAICU, allow=T), + len <- difftime(xmlTime2POSIX(demg$DDICU, allow=TRUE), + xmlTime2POSIX(demg$DAICU, allow=TRUE), units=units) return(cbind(demg, lenstay = len)) } + +#' find the unique spell ID. +#' +#' @param rec ccRecord-class +#' @param duration integer hours +#' @return data.table contains spell id. +#' @export ccd_unique_spell +ccd_unique_spell <- function(rec, duration=2) { + tb <- rec@infotb + short.time.group <- function(sd) { + zeroday <- 0 + if (length(sd[[1]]) == 1) + return(zeroday) + dic <- sd$t_discharge[seq(length(sd$t_discharge)-1)] + adm <- sd$t_admission[2:length(sd$t_admission)] + + # 0 is a mark of first episode. In order to differentiate, I added 1e-7 to all + # the diff days, which give an error of less than 1 mins. + diffday <- c(zeroday, difftime(adm, dic, units="days") + 1e-7) + diffday[is.na(diffday)] <- 0 + diffday + } + setkey(tb, "pid", "t_admission", "t_discharge") + tb[, "diffday":=short.time.group(.SD), by="pid"] + + spell <- Reduce(sum, tb$diffday == 0 | tb$diffday > duration, accumulate=TRUE) + tb$spell <- spell + return(tb) +} + +#' Create demographic table with spell IDs +#' +#' @description same output like ccd_demographic_table but in +#' addition with a spell ID. +#' @param rec ccRecord +#' @param duration the maximum hours of transition period +#' @return data.table demographic table with spell ID in column spell +#' +#' @export +ccd_demographic_spell <- function(rec, duration=2) { + dmg <- ccd_demographic_table(rec) + us <- ccd_unique_spell(rec, duration) + us <- data.table(index=us$index, pid=us$pid, spell=us$spell) + dmg <- merge(dmg, us, by=c("index")) + return(dmg) +} diff --git a/R/filter.categorical.R b/R/filter.categorical.R index 5eb8800..526ad4e 100644 --- a/R/filter.categorical.R +++ b/R/filter.categorical.R @@ -1,11 +1,13 @@ #' @include ccTable.R item.criterion <- function(conf, criterion) { - names(conf)[sapply(conf, function(x) criterion %in% names(x))] + names(conf)[vapply(conf, + function(x) criterion %in% names(x), + logical(1))] } ccTable$methods( - get.categories = function() { + inspect_categories = function() { citems <- item.criterion(conf, "category") lapply(.self$conf[citems], function(x) names(x$category$levels)) }) @@ -17,24 +19,40 @@ ccTable$methods( } ) +#' Categorical data filter +#' +#' Categorical variables only allow a set of values to appear in the variable +#' . Due to various reasons, a categorical variable may contain values that are not +#' standard. The allowed values can be set in the YAML configuration while initialising +#' the ccTable (see ccTable-class, create_cctable). +#' In the following example, we can see how to set up the categorical filter +#' for the variable dead_icu (NIHR_HIC_ICU_0097) which only allows its value to +#' be A, D, E. +#' +#' @name ccTable_filter_categories +#' @examples +#' \dontrun{ +#' # Example for categorical filter setup in the YAML configuration +#' NIHR_HIC_ICU_0097: +#' category: +#' levels: +#' A: Alive +#' D: Dead +#' E: Alive - not discharged +#' apply: drop_entry +#' +#' # Run the filter on ccTable ct +#' ct$filter_categories() # run the filter +#' ct$apply_filters() # apply the filter and create the clean table +#' } +NULL ccTable$methods( - filter.category = function() { + filter_categories = function() { "Check individual entries if they are the in the categories specified in conf." data <- .self$get.data.column("category") - categories <- get.categories() + categories <- inspect_categories() in.category <- function(x, name) x %in% c(categories[[name]], "NA", NA) .self$dfilter$category <- getfilter(data, in.category) }) - - -ccTable$methods( - filter.nodata = function() { - data <- .self$get.data.column("nodata") - nodata <- function(x, ...) { - !all(x %in% c("NA", NA)) - } - .self$dfilter$nodata <- getfilter(data, nodata) - } -) diff --git a/R/filter.missingness.R b/R/filter.missingness.R index 6c17c74..9df37e2 100644 --- a/R/filter.missingness.R +++ b/R/filter.missingness.R @@ -1,6 +1,6 @@ #' @include ccTable.R ccTable$methods( - get.missingness = function() { + get_missingness = function() { miss_count <- function(tb_) { cmplt <- function(vec) { length(which(vec!="NA"))/length(vec) * 100 @@ -21,7 +21,7 @@ ccTable$methods( for (c in seq(missconf)) { col_name <- names(missconf[c]) colr <- missconf[[c]] - tbq <- selectTable(.self$record, items_opt=i, freq=colr) + tbq <- ccd_select_table(.self$record, items_opt=i, freq=colr) setkey(tbq, episode_id, site) oldnm <- names(.self$dquality[['missingness']]) .self$dquality[['missingness']] <- @@ -32,12 +32,23 @@ ccTable$methods( } }) + +#' Data missing filter +#' +#' Deal with data when insufficient data points are supported. There are +#' two key items to be set in the YAML configuration file. +#' 1) labels -- time interval. 2) accept_2d -- the accept present ratio. +#' So if we set the labels is 24, and accept_2d is 70. It means we accept +#' all the missing rate that is lower than 30% every 24 data points. +#' @name ccTable_filter_missingness +#' @param recount logical value. Recount the missingness if TRUE. +NULL ccTable$methods( - filter.missingness = function(recount=FALSE){ + filter_missingness = function(recount=FALSE){ "filter out the where missingness is too low." if (recount || is.null(.self$dquality[['missingness']]) || nrow(.self$dquality[['missingness']]) == 0) - .self$get.missingness() + .self$get_missingness() if (is.null(.self$tclean) || nrow(.self$tclean) == 0) .self$tclean <- .self$torigin @@ -57,4 +68,19 @@ ccTable$methods( with=FALSE], select_index) }) - +#' No data filter +#' +#' Remove the episode when a particular field is not presented. +#' It need to be set up in the YAML configuration file. +#' @name ccTable_filter_nodata +NULL +ccTable$methods( + filter_nodata = function() { + "Exclude episodes when no data is presented in certain fields" + data <- .self$get.data.column("nodata") + nodata <- function(x, ...) { + !all(x %in% c("NA", NA)) + } + .self$dfilter$nodata <- getfilter(data, nodata) + } +) diff --git a/R/filter.range.R b/R/filter.range.R index df7a913..87afedd 100644 --- a/R/filter.range.R +++ b/R/filter.range.R @@ -1,9 +1,10 @@ #' Check if the values of a vector v is in the given ranges. +#' #' @param v vector numeric #' @param range A string contains the numeric ranges in a form such as (low, #' up) for open range and [low, up] for close range. Multiple #' ranges should be separated by semi-columns which is equivalent to logical -#' OR. e.g. (low1, up1); (low2, up2) +#' OR e.g. (low1, up1); (low2, up2) inrange <- function(v, range) { funtxt <- function(r) { spr <- unlist(strsplit(r, ";")) @@ -33,12 +34,33 @@ inrange <- function(v, range) { return(cmpfunc(v)) } + +getfilter <- function(dq, criterion) { + kn <- c("site", "episode_id") + if (all(kn %in% names(dq)) & length(names(dq)) > 2) { + keys <- dq[, kn, with=FALSE] + k_ind <- !(names(dq) %in% kn) + dq <- dq[, names(dq)[k_ind], with=FALSE] + # updating range entry with true/false values + dq <- dq[, Map(criterion, .SD, names(.SD))] + # adding site and episode_id columns. + entry <- data.table(keys, dq) + episode <- entry[, + all(unlist(.SD), na.rm=TRUE), + by=c("site", "episode_id")] + setnames(episode, c("site", "episode_id", "select_index")) + return(list(entry=entry, episode=episode)) + } + else return(NULL) + } + + #' @include ccTable.R ccTable$methods( - get.ranges = function() { + inspect_range = function() { # Initialise with temp column to make sure that dquality has the same # number of rows than torigin - .self$dquality$range <- .self$torigin[,c('site', 'episode_id'), with=F] + .self$dquality$range <- .self$torigin[,c('site', 'episode_id'), with=FALSE] rgnum <- list('red'=1, 'amber'=2, 'green'=3) for(item_name in names(.self$conf)) { item <- .self$conf[[item_name]] @@ -58,17 +80,53 @@ ccTable$methods( } ) + +#' Numerical range filter +#' +#' Range filter can only be applied on numerical fields. +#' For those fields which requires a range filter to be applied, +#' one needs to set a series ranges from the broadest to the narrowest in +#' the YAML configuration. We can set three levels (labels) of ranges, red, amber, and +#' green. It is also OK to set only one range instead of three. +#' The range filter will first assign a label to every data entry. +#' +#' The range in the YAML configuration file can be (l, h), [l, h], (l, h], [h, l) +#' standing for close, open and half open intervals. +#' @param select the range label - "red", "amber", "green" +#' If I give "yellow to select, it means I only want the values which is +#' labeled as "yellow" to be in the clean table. +#' @name ccTable_filter_range +#' @examples +#' \dontrun{ +#' # YAML example +#' NIHR_HIC_ICU_0108: +#' shortName: h_rate +#' dataItem: Heart rate +#' range: +#' labels: +#' red: (0, 300) # broader +#' amber: (11, 170) +#' green: (60, 100) # narrower +#' apply: drop_entry +#' # apply range filter on ccTable ct +#' ct$filter_range("yellow") +#' ct$apply_filters +#'} ccTable$methods( - filter.ranges = function(select='red') { + filter_range = function(select='red') { rgnum <- list('red'=1, 'amber'=2, 'green'=3) + # dq can be either dqaulity table or torigin + # criterion should be a function to give T/F values of each entry. + inselectrange <- function(x, ...) { x >= rgnum[[select]] } if(is.null(.self$dquality$range) || nrow(.self$dquality$range) != nrow(.self$tclean)) - .self$get.ranges() + .self$inspect_range() - .self$dfilter$range <- getfilter(.self$dquality$range, inselectrange) + .self$dfilter$range <- getfilter(.self$dquality$range, + inselectrange) } ) diff --git a/R/imputation.R b/R/imputation.R index c161dfe..74b4f97 100644 --- a/R/imputation.R +++ b/R/imputation.R @@ -28,7 +28,7 @@ ccTable$methods( fun <- imwin[['fun']] lead <- imwin[['lead']] lag <- imwin[['lag']] - sd[[i]] <- interpolateVec(v=sd[[i]], lead=lead, lag=lag, FUN=fun, na.rm=T) + sd[[i]] <- interpolateVec(v=sd[[i]], lead=lead, lag=lag, FUN=fun, na.rm=TRUE) } } return(sd) diff --git a/R/pipeline.R b/R/pipeline.R deleted file mode 100644 index f1d0d97..0000000 --- a/R/pipeline.R +++ /dev/null @@ -1,143 +0,0 @@ -#' @import parallel -NULL - -find.new.xml.file <- function(xml.path) { - rdata.path <- paste(xml.path, ".database", sep="/") - dir.create(rdata.path, showWarnings=F) - - xml.file.name <- dir(xml.path) - - if (! all(grepl("(.xml|.XML|.partxml)$", xml.file.name))) - stop("file names in xml.path ", xml.path, " must end with suffix .xml or .partxml. ") - - parsed.pattern <- unique(sapply(strsplit(dir(rdata.path), ".xml"), function(x) x[1])) - if (length(parsed.pattern) == 0) { - return(xml.file.name) - } else { - all.xml.files <- - xml.file.name[!grepl(paste(parsed.pattern, collapse="|"), xml.file.name)] - return(all.xml.files) - } -} - - -#' Update the RData database -#' -#' Inject episode data from the newly added XML files to the RData database. -#' @param xml.path the path of the folder of which contains the XML files. -#' @param mc.cores number of processors to be applied for parallelisation. -#' @param quiet logical switch on/off of the progress bar. -#' @return ccRecord object -parse.new.xml <- function(xml.path, mc.cores=4, quiet=FALSE) { - files.to.parse <- find.new.xml.file(xml.path) - - db.collection <- mclapply(files.to.parse, - function(x) { - fxml <- paste(xml.path, x, sep="/") - frdata <- paste(xml.path, "/.database/", x, ".RData", sep="") - db <- xml2Data(fxml, quiet=quiet) - save(db, file=frdata) - return(db) - }, mc.cores=mc.cores) - - #' combine new data to a ccRecord - new.db <- ccRecord() - for (i in seq(db.collection)) { - new.db <- new.db + db.collection[[i]] - } - return(new.db) -} - - -#' Update the critical care database (RData) -#' -#' Parse critical care data from XML files and inject them into the RData -#' database. -#' -#' @param xml.path character the path of the folder of which contains the XML files. -#' @param mc.cores integer number of processors to be applied for parallelisation. -#' @param restart logical purge the previous database and restart parsing for all the XML files presented. -#' @param splitxml logical break down the XML files into chuncks. (Do it when the file is too big) -#' @param quiet logical show the progress bar if true -#' @export update_database -update_database <- function(xml.path, restart=FALSE, splitxml=FALSE, - mc.cores=4, quiet=FALSE) { - if (restart) - unlink('.database') - if (splitxml) { - break.down.xml(xml.path) - xml.path2 <- paste(xml.path, ".partxml", sep="/") - } - else - xml.path2 <- xml.path - - parse.new.xml(xml.path2, mc.cores, quiet) - - - alldata <- ccRecord() - files <- dir(paste(xml.path, ".database", sep="/"), - pattern="[^alldata.RData]", - full.names=TRUE) - db <- NULL - - for (i in files) { - load(i) - stopifnot(!is.null(db)) - alldata <- alldata + db - } - - save(alldata, file=paste(xml.path, ".database", "alldata.RData", sep="/")) - - invisible(alldata) -} - -parse.big.xml <- function(xml.path, mc.cores=4, quiet=TRUE, tmpd="/tmp", maxsize=3) { - - fparse<- paste(xml.path, find.new.xml.file(xml.path), sep="/") - fbig <- fparse[file.info(fparse)$size/1e9 > maxsize] - fbig_nopath <- sapply(strsplit(fbig, "/"), function(x) x[length(x)]) - - if (length(fbig > 0)) { - cat("Detected big files ... \n") - - tmpd <- paste(tmpd, "ccd_big_files", sep="/") - - if (dir.exists(tmpd)) unlink(tmpd, recursive=T) - dir.create(tmpd) -# file.copy(fbig, tmpd) -# stopifnot(all(fbig_nopath == dir(tmpd))) -# update_database(tmpd, mc.cores=mc.cores, quiet=quiet) - - cmd <- system.file("pipeline/break_into.sh", package="cleanEHR") - print(cmd) - for (i in fbig) system2(cmd, c(i, 3)) - - for(i in fbig_nopath) cat(" [+] ", i, "\n", sep="") - } - -} -break.down.xml <- function(xml.path) { - unlink(paste(xml.path, ".partxml", sep="/"), recursive=T) - partxml.dir <- paste(xml.path, ".partxml", sep="/") - dir.create(partxml.dir) - cmd <- paste(find.package('cleanEHR'), "pipeline/break_into.sh", sep="/") - # in the case of using testings, the original package layout is slightly - # different from the compiled one. - if (! file.exists(cmd)) - cmd <- paste(find.package('cleanEHR'), "inst/pipeline/break_into.sh", sep="/") - - newfile <- find.new.xml.file(xml.path) - if (length(newfile) > 0) - newfile <- paste(xml.path, newfile, sep="/") - else{ - return(1) - } - - for (f in newfile) { - system2(cmd, c(f, 100)) - partxml.file <- list.files(xml.path, pattern=".partxml", full.names=T) - file.copy(partxml.file, partxml.dir) - file.remove(partxml.file) - } - return(1) -} diff --git a/R/reallocateTime.R b/R/reallocateTime.R deleted file mode 100644 index 7706b44..0000000 --- a/R/reallocateTime.R +++ /dev/null @@ -1,108 +0,0 @@ -#' @importFrom Rcpp evalCpp -#' @useDynLib cleanEHR -reallocateTime <- function(d, t_discharge, frequency) { - d_ <- d - stopifnot(any(names(d) == "time")) - stopifnot(any(names(d) == "item2d")) - stopifnot(class(d$time) == "numeric") - return(reallocateTime_(d_, t_discharge, frequency)) -} - - -findMaxTime <- function(episode) { - get2dTime <- function(episode){ - time_lt <- - lapply(episode@data, - function(item){ - if(length(item) > 1) { - if (!is.numeric(item$time)) - item$time <- - as.numeric(as.character(item$time)) - return(max(item$time)) - } - }) - tm <- unlist(time_lt) - tm - } - tm <- get2dTime(episode) - if (is.null(tm)) - return(NULL) - else - return(max(tm)) -} - - -#' Get the length of stay based on the first and the last data point. -#' -#' @param e ccEpisode object. -#' @param unit character string. Units in which the results are desired. Can be abbreviated. -#' @return length of stay -#' @export getEpisodePeriod -getEpisodePeriod <- function (e, unit="hours") { - # pseudo delta period, see addPseudoTime() - if (class(e@t_discharge)[1] == "numeric") - return(e@t_discharge) - - if (class(e@t_admission)[1] != "POSIXct") - tadm <- xmlTime2POSIX(as.character(e@t_admission), allow=T) - else - tadm <- e@t_admission - if (class(e@t_discharge)[1] != "POSIXct") - tdisc <- xmlTime2POSIX(as.character(e@t_discharge), allow=T) - else - tdisc <- e@t_discharge - - # The failure of POSIX conversion indicates that this episode is either - # anonymised or has a missing or incorrect value of discharge or admission - # time. - if (is.na(tadm) || is.na(tdisc)) - period_length <- findMaxTime(e) - else { - if (any(is.null(tdisc), is.null(tadm))) - period_length <- NULL - else - period_length <- as.numeric(tdisc - tadm, - units=unit) - } - # in cases that tdisc == tadm - if (!is.null(period_length)) { - if (period_length == 0) - period_length <- period_length + 1 - } - - if (is.null(period_length)) - warning("This episode does not have any time series data: ", - " episode_id = ", e@episode_id, - " nhs_number = ", e@nhs_number, - " pas_number = ", e@pas_number, - " period_length = ", period_length, "\n") - - - return(period_length) -} - -#' Propagate a numerical delta time interval record. -#' @param record ccRecord -#' @param delta time frequency in hours -#' @details when discharge time and admission time are missing, the latest and -#' the earliest data time stamp will be used instead. -#' @export reallocateTimeRecord -reallocateTimeRecord <- function(record, delta=0.5) { - reallocate.episode <- function(e) { - env <- environment() - # make sure admin and disc time is correct - period_length <- getEpisodePeriod(e) - if (period_length < 0) warning("period length < 0") - - # calling reallocateTime for each data item - new.episode(lapply(e@data, - function(d) { - if (length(d) > 1) { - return(reallocateTime(d, env$period_length, delta)) - } else - return(d) - })) - } - newdata <- for_each_episode(record, reallocate.episode) - return(ccRecord() + newdata) -} diff --git a/R/selectTable.R b/R/selectTable.R deleted file mode 100644 index 8cc0614..0000000 --- a/R/selectTable.R +++ /dev/null @@ -1,89 +0,0 @@ -#' Create wide table from ccRecord -#' -#' @param record ccRecord -#' @param items_opt character vectors. Items (HIC code) selected in item_opt are optional items, which will be automatically -#' filled when item is missing. -#' @param items_obg obligatory items that is obligatory; Any episode that doesn't contain -#' item in this vector will be removed. -#' @param freq numeric cadence in hour. -#' @param return_list logical if TRUE return as a list. -#' @return data.table -#' @export selectTable -selectTable <- function(record, items_opt=NULL, items_obg=NULL, freq, - return_list=FALSE) { - all_items <- c(items_opt, items_obg) - if (is.null(all_items)) - stop('both items_opt and items_obg are NULL') - - env <- environment() - lt <- list() - stopifnot(is.list(env$lt)) # totally redundent, just to avoid an anonying - # note says env is assigned but not used! - for_each_episode(record, - function(ep) { - if (all(items_obg %in% names(ep@data))) { - result <- list() - period_length <- getEpisodePeriod(ep) - # getEpisodePeriod will return NULL when no 2D - # data been found. - if (!is.null(period_length)) { - if (period_length > 0 ) { - result <- append(result, - itemsToDataFrame(ep, all_items, - period_length, - freq)) - nlength <- length(result[["time"]]) - result[["site"]] <- rep(ep@site_id, nlength) - result[["episode_id"]] <- rep(ep@episode_id, nlength) - env$lt[[length(lt) + 1]]<- .simple.data.frame(result) - } - } - } - }) - if (return_list) - return(lt) - - # fill is true because meta data column can be missing. - dt <- rbindlist(lt, fill=TRUE) - - - # Adding missing meta columns to keep the 2d wide consistent. - code.has.meta <- names(unlist(sapply(ITEM_REF, function(x) x$NHICmetaCode))) - for (i in all_items) { - meta.code <- paste(i, "meta", sep=".") - if (i %in% code.has.meta & !(meta.code %in% names(dt))) { - dt[[meta.code]] <- rep("NA", nrow(dt)) - } - } - - # convert data type - for (i in all_items) - dt[[i]] <- suppressWarnings(.which.datatype(i)(as.character(dt[[i]]))) - - - return(dt) -} - -itemsToDataFrame <- function(ep, items, period_length, freq) { - listmatrix <- list() - time <- seq(0, period_length, freq) - - listmatrix[["time"]] <- time - - for (i in items) { - if (length(ep@data[[i]]) == 1) { - listmatrix[[i]] <- rep(ep@data[[i]], length(time)) - } - else { - if ("time" %in% names(ep@data[[i]])) { - new <- reallocateTime(ep@data[[i]], period_length, freq) - listmatrix[[i]] <- new$val - if ("meta" %in% names(ep@data[[i]])) - listmatrix[[paste(i, "meta", sep=".")]] <- new$meta - } - else - listmatrix[[i]] <- rep("NA", length(time)) - } - } - return(listmatrix) -} diff --git a/R/sql_demographic.R b/R/sql_demographic.R deleted file mode 100644 index 9e6416e..0000000 --- a/R/sql_demographic.R +++ /dev/null @@ -1,35 +0,0 @@ -#' Create demographic SQL tables. The data type of each column is in its -#' corresponding data type. -#' -#' @param record ccRecord-class -#' @param dtype logical column will be type aware, else all in character. -#' @export sql.demographic.table -sql.demographic.table <- function(record, dtype=TRUE) { - env <- environment() - demogls <- list() - stopifnot(is.list(env$demogls)) - all.demcode <- all.nhic.code("Demographic") - for_each_episode(record, - function(x){ - demog.data <- rep("NULL", length(all.demcode)) - names(demog.data) <- all.demcode - demog.data <- as.list(demog.data) - for(item in names(x@data)) { - if (length(x@data[[item]]) == 1) { - demog.data[[item]] <- x@data[[item]] - } - } - env$demogls[[length(env$demogls) + 1]] <- .simple.data.frame(demog.data) - }) - demogt <- rbindlist(demogls, fill=T) - setnames(demogt, code2stname(names(demogt))) - - if (dtype) { - for (i in seq(ncol(demogt))){ - demogt[[i]] <- - .which.datatype(stname2code(names(demogt)[i]))(demogt[[i]]) - } - } - demogt[, "index":=seq(nrow(demogt))] - return(demogt) -} diff --git a/R/stdid.R b/R/stdid.R index 082318a..81135f5 100644 --- a/R/stdid.R +++ b/R/stdid.R @@ -10,7 +10,7 @@ StdId <- setClass ("StdId", if (length(object@ids) != length(ids)) return("initialisation failure, as the standard ID pattern cannot be found.") else - object@ids = ids + object@ids <- ids return(TRUE) }) @@ -86,6 +86,18 @@ stname2longname <- function(stname) { return(code) } +#' Convert long names to short names. +#' +#' @param l long name such as "heart rate" +#' @return short name character such as "h_rate" +#' @export +long2stname <- function(l) { + l <- as.character(l) + s <- long2stname.dict[l] + s[is.na(s)] <- l[is.na(s)] + return(s) +} + #' Identify the classification - classification1 #' diff --git a/R/summary.R b/R/summary.R deleted file mode 100644 index 8eb87d1..0000000 --- a/R/summary.R +++ /dev/null @@ -1,81 +0,0 @@ -#' Individual episode graph -#' -#' Create an individual episode graph for its diagnosis, drugs and physiological -#' variables. Diagnosis and drugs are always included, while the user can -#' select other longitudinal data. -#' @param ccd ccRecord -#' @param eid character the episode index in the ccRecord -#' @param items character NIHC code of longitudinal data. -#' @export episode.graph -episode.graph <- function(ccd, eid=601, items=NULL) { - ep <- ccd[[eid]][[1]] - t_ad <- ep@t_admission - t_dc <- ep@t_discharge - - - if (is.null(items)) - items <- c("h_rate", "spo2", "bilirubin", "platelets", "pao2_fio2", "gcs_total") - - all.drugs <- names(which(class.dict_code[names(ITEM_REF)] == "Drugs")) - used.drugs <- code2stname(all.drugs[all.drugs %in% names(ep@data)]) - - classification.dictionary <- sapply(ITEM_REF, function(x) x$Classification1) - - - create.long.table <- function(ep, items) { - items <- data.table(items=items, - code=stname2code(items), - longname=stname2longname(items), - class=classification.dictionary[stname2code(items)]) - units <- unit.dict[items$code] - units[is.na(units)] <- "" - items$longname <- paste0(items$longname, "\n", units) - - ltb <- list() - for (i in seq(nrow(items))) { - if (is.null(ep@data[[items[i]$code]])) - ltb[[i]] <- data.frame() - else - ltb[[i]] <- data.frame(ep@data[[items[i]$code]], - item=items[i]$longname) - } - ltb <- rbindlist(ltb, use.names=T, fill=T) - if (is.numeric(ltb$time)) - ltb$time <- t_ad + ltb$time * 60 * 60 - ltb$item2d <- as.numeric(ltb$item2d) - return(ltb) - } - - physio.tb <- create.long.table(ep, items) - physio.tb <- data.frame(physio.tb, - catg1=physio.tb$item, - catg2="Physiology Data") - drug.tb <- create.long.table(ep, used.drugs) - - drug.tb <- data.frame(drug.tb, catg1="Drugs", - catg2=drug.tb$item) - - - tb <- rbindlist(list(physio.tb, drug.tb), fill=T, use.names=T) - - - ggp <- ggplot(tb, aes_string(x="time", y="item2d", group="item", - colour="catg2")) + geom_line(colour="#1E506C") + - geom_point(size=1) + - facet_grid(catg1 ~., scales="free_y") + - geom_vline(xintercept = as.numeric(t_ad), colour="#D1746F") + - geom_vline(xintercept = as.numeric(t_dc), colour="#D1746F") + - scale_colour_manual(values=c("#1E506C", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#FFFFFF"), - name=paste0(ep@episode_id, "_", ep@site_id, "\n", - icnarc2diagnosis(ep@data[[stname2code('RAICU1')]]), "\n\n")) + - theme(legend.title = element_text(size=8), - legend.text = element_text(size=8)) + - labs(x="", y="") - - - - graphics::plot(ggp) - #"#1E506C""#D1746F" - - return(tb) -} diff --git a/R/unique.spell.R b/R/unique.spell.R deleted file mode 100644 index ecd1c8e..0000000 --- a/R/unique.spell.R +++ /dev/null @@ -1,43 +0,0 @@ -#' find the unique spell ID. -#' -#' @param rec ccRecord-class -#' @param duration integer hours -#' @return data.table contains spell id. -#' @export unique_spell -unique_spell <- function(rec, duration=2) { - tb <- rec@infotb - short.time.group <- function(sd) { - zeroday <- 0 - if (length(sd[[1]]) == 1) - return(zeroday) - dic <- sd$t_discharge[1:length(sd$t_discharge)-1] - adm <- sd$t_admission[2:length(sd$t_admission)] - - # 0 is a mark of first episode. In order to differentiate, I added 1e-7 to all - # the diff days, which give an error of less than 1 mins. - diffday <- c(zeroday, difftime(adm, dic, units="days") + 1e-7) - diffday[is.na(diffday)] <- 0 - diffday - } - setkey(tb, "pid", "t_admission", "t_discharge") - tb[, "diffday":=short.time.group(.SD), by="pid"] - - spell <- Reduce(sum, tb$diffday == 0 | tb$diffday > duration, accumulate=T) - tb$spell <- spell - return(tb) -} - -#' Assign unique spell ID to the demographic table -#' -#' @param rec ccRecord -#' @param duration the maximum hours of transition period -#' @return data.table demographic table with spell ID in column spell -#' -#' @export demographic.patient.spell -demographic.patient.spell <- function(rec, duration=2) { - dmg <- sql.demographic.table(rec) - us <- unique_spell(rec, duration) - us <- data.table(index=us$index, pid=us$pid, spell=us$spell) - dmg <- merge(dmg, us, by=c("index")) - return(dmg) -} diff --git a/R/utilities.R b/R/utilities.R index 6510552..6630c41 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -1,26 +1,15 @@ -#' This a reference table of NHIC data items. -#' -#' @name data.checklist -#' @docType data -#' @author Sinan Shi \email{s.shi@ucl.ac.uk} -#' @keywords data -NULL - - - - -#' get indexing tables for time label, time-wise value, meta data label, and -#' meta data. -#' @return list of vectors contains time.index, datat.index, meta.index, -#' datam.index -extractIndexTable <- function() { - info <- extractInfo() +# get indexing tables for time label, time-wise value, +# meta data label, and meta data. Return a list of vectors +# contains time.index, datat.index, meta.index, +# datam.index +extract_index_table <- function() { + info <- extract_info() checklist <- list() for(i in seq(info$nontime)) checklist[[info$nontime[i]]] <- "item1d" for(i in seq(info$time$idt)) - checklist[[info$time$idt[i]]] <-"time" + checklist[[info$time$idt[i]]] <- "time" for(i in seq(info$time$id)) checklist[[info$time$id[i]]] <- "item2d" for(i in seq(info$meta$meta)) @@ -57,7 +46,7 @@ extractIndexTable <- function() { 'date/time' = as.character, # They are hashed for now 'list / logical' = as.character) # what are they? - datatype = ITEM_REF[[id]]$Datatype + datatype <- ITEM_REF[[id]]$Datatype if (!is.null(datatype)){ if (exists(datatype, operations)){ return(operations[[datatype]]) @@ -73,11 +62,12 @@ whichIsCode <- function(nhic) { return(grepl(nhic, pattern="[0-9][0-9][0-9][0-9]")) } -#' extract information from data.checklist +#' Extract information from data.checklist +#' #' @return list of time [data.frame(id, idt)], meta [data.frame(id, idmeta)], #' nontime [numeric], MAX_NUM_NHIC -#' @export extractInfo -extractInfo <- function() { +#' @export +extract_info <- function() { index.time <- whichIsCode(data.checklist$NHICdtCode) index.meta <- whichIsCode(data.checklist$NHICmetaCode) @@ -117,9 +107,9 @@ extractInfo <- function() { #' @export lookup.items lookup.items <- function(keyword, style="grid") { - index1 <- grep(keyword, stname2longname.dict, ignore.case=T) - index2 <- grep(keyword, names(stname2longname.dict), ignore.case=T) - index3 <- grep(keyword, stname2code(names(stname2longname.dict)), ignore.case=T) + index1 <- grep(keyword, stname2longname.dict, ignore.case=TRUE) + index2 <- grep(keyword, names(stname2longname.dict), ignore.case=TRUE) + index3 <- grep(keyword, stname2code(names(stname2longname.dict)), ignore.case=TRUE) stn <- unique(names(stname2longname.dict[c(index1, index2, index3)])) @@ -182,22 +172,11 @@ site.info <- function(){ "R42"=c("Unknown", "Unknown", "Unknown", "Unknown"), "X90"=c("Addenbrooke's Hospital", "General/Liver/Transplant", "Cambridge", "John Farnham") ) - si <- data.frame(t(.simple.data.frame(si)), stringsAsFactors=F) + si <- data.frame(t(.simple.data.frame(si)), stringsAsFactors=FALSE) names(si) <- c("Hospital", "Unit", "Trust", "Comments") return(si) } - - -#' ICNARC diagnosis reference table -#' -#' @name icnarc -#' @references \url{https://www.icnarc.org/Our-Audit/Audits/Cmp/Resources/Icm-Icnarc-Coding-Method} -#' @docType data -#' @keywords data -NULL - - #' Convert ICNARC codes to diagnosis (text) #' #' NOTE: There are still ~600 code missing. see issue #133 @@ -212,8 +191,8 @@ icnarc2diagnosis <- function(icnarc, surgery=TRUE, levels=NULL) { if(!is.null(levels)) icnarc <- icnarc.breakdown(icnarc, digits=levels) else - icnarc <- sapply(lapply(strsplit(icnarc, split='[.]'), as.numeric), - function(x) paste(x, collapse=".")) + icnarc <- vapply(lapply(strsplit(icnarc, split='[.]'), as.numeric), + function(x) paste(x, collapse="."), character(1)) diag <- as.character(icnarc.dict[icnarc]) if (!surgery) diff --git a/R/zzz.R b/R/zzz.R index 9750c54..45ae7dd 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -24,23 +24,24 @@ return(new) } - stname <- sapply(ITEM_REF, function(x) x$shortName) + stname <- vapply(ITEM_REF, function(x) x$shortName, "") meta <- paste0(stname, ".meta") names(meta) <- paste0(names(stname), ".meta") code2stname.dict <- c(stname, meta) stname2code.dict <- reverse.name.value(code2stname.dict) - longnames <- sapply(ITEM_REF, function(x) x$dataItem) + longnames <- vapply(ITEM_REF, function(x) x$dataItem, "") stname2longname.dict <- longnames names(stname2longname.dict) <- stname + long2stname.dict <- reverse.name.value(stname2longname.dict) - #' classification dictionary: demographic, nurse, physiology, laboratory, drugs - class.dict_code <- sapply(ITEM_REF, function(x) x$Classification1) + # classification dictionary: demographic, nurse, physiology, laboratory, drugs + class.dict_code <- vapply(ITEM_REF, function(x) x$Classification1, "") class.dict_stname <- class.dict_code names(class.dict_stname) <- as.character(code2stname.dict[class.dict_stname]) - #' Time variable dictionary + # Time variable dictionary tval.dict_code <- data.checklist$NHICdtCode != "NULL" tval.dict_stname <- tval.dict_code names(tval.dict_stname) <- as.character(data.checklist$NHICcode) @@ -50,10 +51,11 @@ assign("code2stname.dict" , code2stname.dict , envir=env) assign("stname2code.dict" , stname2code.dict , envir=env) assign("stname2longname.dict" , stname2longname.dict , envir=env) + assign("long2stname.dict" , long2stname.dict , envir=env) assign("class.dict_code" , class.dict_code , envir=env) assign("class.dict_stname", class.dict_stname, envir=env) assign("tval.dict_code" , tval.dict_code , envir=env) assign("tval.dict_stname" , tval.dict_stname , envir=env) - assign('checklist', extractIndexTable(), envir=env) + assign('checklist', extract_index_table(), envir=env) } diff --git a/README.md b/README.md index d014638..2da1b87 100644 --- a/README.md +++ b/README.md @@ -7,14 +7,16 @@ -`cleanEHR` is an R package for working with the Critical Care Health Informatics -Collaborative's data set. Since 2014 data from the critical care units at -Cambridge, Guys/Kings/St Thomas', Imperial, Oxford, and University College -London has been extracted and stored securely in a standardised format. +`cleanEHR` is an electronic health care record (EHR) data cleaning and +processing platform, which works with the Critical Care Health Informatics +Collaborative's data set. The purpose of the project is to enable researchers +to answer clinical questions that are important to patients, but which are +normally too difficult because data is unstandardised, siloed, and +inaccessible. -The purpose of the project is to enable researchers to answer clinical -questions that are important to patients, but which are normally too difficult -because data is unstandardised, siloed, and inaccessible. +Since 2014 data from the critical care units at Cambridge, Guys/Kings/St +Thomas', Imperial, Oxford, and University College London has been extracted and +stored securely in a standardised format. These data are crucially needed by healthcare professionals for the delivery and continuity of care; by administrators for audit, planning and service @@ -65,12 +67,12 @@ install.packages("cleanEHR") ### From Github to install the latest development version. ``` install.packages("devtools") -devtools::install_github("UCL-HIC/cleanEHR.git") +devtools::install_github("CC-HIC/cleanEHR") ``` ## Vignette -Check out the [tour](https://cc-hic.github.io/cleanEHR/tour.html) for a basic -introduction to cleanEHR +* Introduction to CCHIC critical care data [here](https://cc-hic.github.io/cleanEHR/cchic_overview.html) +* Data cleaning and wrangling with cleanEHR [here](https://cc-hic.github.io/cleanEHR/data_clean.html) ## How to contribute diff --git a/inst/doc/tour.R b/inst/doc/tour.R deleted file mode 100644 index 86d6d51..0000000 --- a/inst/doc/tour.R +++ /dev/null @@ -1,93 +0,0 @@ -## ------------------------------------------------------------------------ -library(cleanEHR) -data.path <- paste0(find.package("cleanEHR"), "/doc/sample_ccd.RData") -load(data.path) - -## ------------------------------------------------------------------------ -print(head(ccd@infotb)) - -## ------------------------------------------------------------------------ -# quickly check how many episodes are there in the dataset. -ccd@nepisodes - -## ---- fig.width=10, fig.height=11, out.width='700px', results='hide', message=FALSE, warning=FALSE---- -# check the heart rate, bilirubin, fluid balance, and drugs of episode_id = 7. -# NOTE: due to anonymisation reason, some episodes data cannot be displayed -# properly. -episode.graph(ccd, 7, c("h_rate", "bilirubin", "fluid_balance_d")) - -## ---- fig.width=10, fig.height=6, out.width='700px', results='hide', message=FALSE, warning=FALSE---- -# contains all the 1D fields i.e. non-longitudinal -tb1 <- sql.demographic.table(ccd) - -# filter out all dead patient. (All patients are dead in the dataset.) -tb1 <- tb1[DIS=="D"] - -# subset variables we want (ARSD = Advanced respiratory support days, -# apache_prob = APACHE II probability) -tb <- tb1[, c("SEX", "ARSD", "apache_prob"), with=F] -tb <- tb[!is.na(apache_prob)] - -# plot -library(ggplot2) -ggplot(tb, aes(x=apache_prob, y=ARSD, color=SEX)) + geom_point() - - -## ------------------------------------------------------------------------ -# To prepare a YAML configuration file like this. You write the following text -# in a YAML file. -conf <- " -NIHR_HIC_ICU_0108: - shortName: hrate -NIHR_HIC_ICU_0112: - shortName: bp_sys_a - dataItem: Systolic Arterial blood pressure - Art BPSystolic Arterial blood pressure -NIHR_HIC_ICU_0093: - shortName: sex -" -library(yaml) -tb <- create.cctable(ccd, yaml.load(conf), freq=1) - -# a lazy way to do that. -tb <- create.cctable(ccd, list(NIHR_HIC_ICU_0108=list(), - NIHR_HIC_ICU_0112=list(), - NIHR_HIC_ICU_0093=list()), - freq=1) -print(tb$tclean) - -## ------------------------------------------------------------------------ -tb$tclean[, mean(NIHR_HIC_ICU_0108, na.rm=T), by=c("site", "episode_id")] - -## ---- fig.width=12, fig.height=12, out.width='700px', results='hide', message=FALSE, warning=FALSE---- -conf <-" -NIHR_HIC_ICU_0108: - shortName: hrate - dataItem: Heart rate - distribution: normal - decimal_places: 0 - range: - labels: - red: (0, 300) - amber: (11, 150) - apply: drop_entry - missingness: # remove episode if missingness is higher than 70% in any 24 hours interval - labels: - yellow: 24 - accept_2d: - yellow: 70 - apply: drop_episode -" - -ctb <- create.cctable(ccd, yaml.load(conf), freq=1) -ctb$filter.ranges("amber") # apply range filters -ctb$filter.missingness() -ctb$apply.filters() - -cptb <- rbind(cbind(ctb$torigin, data="origin"), - cbind(ctb$tclean, data="clean")) - - -ggplot(cptb, aes(x=time, y=NIHR_HIC_ICU_0108, color=data)) + - geom_point(size=1.5) + facet_wrap(~episode_id, scales="free_x") - - diff --git a/inst/doc/tour.Rmd b/inst/doc/tour.Rmd deleted file mode 100644 index f1e8eef..0000000 --- a/inst/doc/tour.Rmd +++ /dev/null @@ -1,156 +0,0 @@ ---- -title: "A brief tour of cleanEHR" -author: David Perez Suarez & Sinan Shi -date: "`r Sys.Date()`" -output: rmarkdown::html_vignette -vignette: > - %\VignetteEngine{knitr::rmarkdown} - %\VignetteIndexEntry{Tour} - \usepackage[utf8]{inputenc} ---- - - -### Load data -Usually a RData file which stores all the dataset will be given. A sample RData -can be found in `/doc/sample_ccd.RData`. -```{r} -library(cleanEHR) -data.path <- paste0(find.package("cleanEHR"), "/doc/sample_ccd.RData") -load(data.path) -``` -### Data overview -You can have a quick overview of the data by checking `infotb`. In the sample -dataset, sensitive variables such as NHS number and admission time have been -removed or twisted. -```{r} -print(head(ccd@infotb)) -``` -The basic entry of the data is episode which indicates an admission of a site. -Using `episode_id` and `site_id` can locate a unique admission entry. `pid` is -a unique patient identifier. - -```{r} -# quickly check how many episodes are there in the dataset. -ccd@nepisodes -``` - -There are 263 fields which covers patient demographics, physiology, laboratory, -and medication information. Each field has 2 labels, NHIC code and short name. -There is a function `lookup.items()` to look up the fields you need. -`lookup.items()` function is case insensitive and allows fuzzy search. -``` -# searching for heart rate -lookup.items('heart') # fuzzy search - -+-------------------+--------------+--------------+--------+-------------+ -| NHIC.Code | Short.Name | Long.Name | Unit | Data.type | -+===================+==============+==============+========+=============+ -| NIHR_HIC_ICU_0108 | h_rate | Heart rate | bpm | numeric | -+-------------------+--------------+--------------+--------+-------------+ -| NIHR_HIC_ICU_0109 | h_rhythm | Heart rhythm | N/A | list | -+-------------------+--------------+--------------+--------+-------------+ - -``` - -### Inspect individual episode -```{r, fig.width=10, fig.height=11, out.width='700px', results='hide', message=FALSE, warning=FALSE} -# check the heart rate, bilirubin, fluid balance, and drugs of episode_id = 7. -# NOTE: due to anonymisation reason, some episodes data cannot be displayed -# properly. -episode.graph(ccd, 7, c("h_rate", "bilirubin", "fluid_balance_d")) -``` - -## Non-longitudinal Data -`sql.demographic.table()` can generate a `data.table` that contains all the -non-longitudinal variables. A demonstration of how to do some work on a subset -of data. -```{r, fig.width=10, fig.height=6, out.width='700px', results='hide', message=FALSE, warning=FALSE} -# contains all the 1D fields i.e. non-longitudinal -tb1 <- sql.demographic.table(ccd) - -# filter out all dead patient. (All patients are dead in the dataset.) -tb1 <- tb1[DIS=="D"] - -# subset variables we want (ARSD = Advanced respiratory support days, -# apache_prob = APACHE II probability) -tb <- tb1[, c("SEX", "ARSD", "apache_prob"), with=F] -tb <- tb[!is.na(apache_prob)] - -# plot -library(ggplot2) -ggplot(tb, aes(x=apache_prob, y=ARSD, color=SEX)) + geom_point() - -``` - -## Longitudinal data -To deal with longitudinal data, we need to first to transform it into a long -table format. - -### Create a `cctable` -```{r} -# To prepare a YAML configuration file like this. You write the following text -# in a YAML file. -conf <- " -NIHR_HIC_ICU_0108: - shortName: hrate -NIHR_HIC_ICU_0112: - shortName: bp_sys_a - dataItem: Systolic Arterial blood pressure - Art BPSystolic Arterial blood pressure -NIHR_HIC_ICU_0093: - shortName: sex -" -library(yaml) -tb <- create.cctable(ccd, yaml.load(conf), freq=1) - -# a lazy way to do that. -tb <- create.cctable(ccd, list(NIHR_HIC_ICU_0108=list(), - NIHR_HIC_ICU_0112=list(), - NIHR_HIC_ICU_0093=list()), - freq=1) -print(tb$tclean) -``` - -### Manipulate on `cctable` -* Get the mean heart rate of each patient. -```{r} -tb$tclean[, mean(NIHR_HIC_ICU_0108, na.rm=T), by=c("site", "episode_id")] -``` - - -### Data cleaning -To clean the data, one needs to write the specification in the YAML -configuration file. - -```{r, fig.width=12, fig.height=12, out.width='700px', results='hide', message=FALSE, warning=FALSE} -conf <-" -NIHR_HIC_ICU_0108: - shortName: hrate - dataItem: Heart rate - distribution: normal - decimal_places: 0 - range: - labels: - red: (0, 300) - amber: (11, 150) - apply: drop_entry - missingness: # remove episode if missingness is higher than 70% in any 24 hours interval - labels: - yellow: 24 - accept_2d: - yellow: 70 - apply: drop_episode -" - -ctb <- create.cctable(ccd, yaml.load(conf), freq=1) -ctb$filter.ranges("amber") # apply range filters -ctb$filter.missingness() -ctb$apply.filters() - -cptb <- rbind(cbind(ctb$torigin, data="origin"), - cbind(ctb$tclean, data="clean")) - - -ggplot(cptb, aes(x=time, y=NIHR_HIC_ICU_0108, color=data)) + - geom_point(size=1.5) + facet_wrap(~episode_id, scales="free_x") - -``` diff --git a/inst/doc/tour.html b/inst/doc/tour.html deleted file mode 100644 index 5a324dc..0000000 --- a/inst/doc/tour.html +++ /dev/null @@ -1,285 +0,0 @@ - - - - - - - - - - - - - - - - -A brief tour of cleanEHR - - - - - - - - - - - - - - - - - -

A brief tour of cleanEHR

-

David Perez Suarez & Sinan Shi

-

2017-01-31

- - - -
-

Load data

-

Usually a RData file which stores all the dataset will be given. A sample RData can be found in /doc/sample_ccd.RData.

-
library(cleanEHR)
-data.path <- paste0(find.package("cleanEHR"), "/doc/sample_ccd.RData")
-load(data.path)
-
-
-

Data overview

-

You can have a quick overview of the data by checking infotb. In the sample dataset, sensitive variables such as NHS number and admission time have been removed or twisted.

-
print(head(ccd@infotb))
-
##        site_id episode_id nhs_number pas_number         t_admission
-## 1: pseudo_site          1         NA         NA 1970-01-01 01:00:00
-## 2: pseudo_site          2         NA         NA 1970-01-01 01:00:00
-## 3: pseudo_site          3         NA         NA 1970-01-01 01:00:00
-## 4: pseudo_site          4         NA         NA 1970-01-01 01:00:00
-## 5: pseudo_site          5         NA         NA 1970-01-01 01:00:00
-## 6: pseudo_site          6         NA         NA 1970-01-01 01:00:00
-##    t_discharge parse_file parse_time pid index
-## 1:        <NA>         NA       <NA>   1     1
-## 2:        <NA>         NA       <NA>   2     2
-## 3:        <NA>         NA       <NA>   3     3
-## 4:        <NA>         NA       <NA>   4     4
-## 5:        <NA>         NA       <NA>   5     5
-## 6:        <NA>         NA       <NA>   6     6
-

The basic entry of the data is episode which indicates an admission of a site. Using episode_id and site_id can locate a unique admission entry. pid is a unique patient identifier.

-
# quickly check how many episodes are there in the dataset.
-ccd@nepisodes
-
## [1] 30
-

There are 263 fields which covers patient demographics, physiology, laboratory, and medication information. Each field has 2 labels, NHIC code and short name. There is a function lookup.items() to look up the fields you need. lookup.items() function is case insensitive and allows fuzzy search.

-
# searching for heart rate
-lookup.items('heart') # fuzzy search
-
-+-------------------+--------------+--------------+--------+-------------+
-|     NHIC.Code     |  Short.Name  |  Long.Name   |  Unit  |  Data.type  |
-+===================+==============+==============+========+=============+
-| NIHR_HIC_ICU_0108 |    h_rate    |  Heart rate  |  bpm   |   numeric   |
-+-------------------+--------------+--------------+--------+-------------+
-| NIHR_HIC_ICU_0109 |   h_rhythm   | Heart rhythm |  N/A   |    list     |
-+-------------------+--------------+--------------+--------+-------------+
-
-
-
-

Inspect individual episode

-
# check the heart rate, bilirubin, fluid balance, and drugs of episode_id = 7. 
-# NOTE: due to anonymisation reason, some episodes data cannot be displayed
-# properly. 
-episode.graph(ccd, 7, c("h_rate",  "bilirubin", "fluid_balance_d"))
-

-
-
-

Non-longitudinal Data

-

sql.demographic.table() can generate a data.table that contains all the non-longitudinal variables. A demonstration of how to do some work on a subset of data.

-
# contains all the 1D fields i.e. non-longitudinal
-tb1 <- sql.demographic.table(ccd)
-
-# filter out all dead patient. (All patients are dead in the dataset.)
-tb1 <- tb1[DIS=="D"]
-
-# subset variables we want (ARSD = Advanced respiratory support days,
-# apache_prob = APACHE II probability)
-tb <- tb1[, c("SEX", "ARSD", "apache_prob"), with=F]
-tb <- tb[!is.na(apache_prob)]
-
-# plot
-library(ggplot2)
-ggplot(tb, aes(x=apache_prob, y=ARSD, color=SEX)) + geom_point()
-

-
-
-

Longitudinal data

-

To deal with longitudinal data, we need to first to transform it into a long table format.

-
-

Create a cctable

-
# To prepare a YAML configuration file like this. You write the following text
-# in a YAML file. 
-conf <- "
-NIHR_HIC_ICU_0108:
-  shortName: hrate
-NIHR_HIC_ICU_0112:
-  shortName: bp_sys_a
-  dataItem: Systolic Arterial blood pressure - Art BPSystolic Arterial blood pressure
-NIHR_HIC_ICU_0093:
-   shortName: sex
-"
-library(yaml)
-tb <- create.cctable(ccd, yaml.load(conf), freq=1)
-
-# a lazy way to do that. 
-tb <- create.cctable(ccd, list(NIHR_HIC_ICU_0108=list(), 
-                         NIHR_HIC_ICU_0112=list(), 
-                         NIHR_HIC_ICU_0093=list()), 
-                     freq=1)
-print(tb$tclean)
-
##       time NIHR_HIC_ICU_0108 NIHR_HIC_ICU_0112 NIHR_HIC_ICU_0093
-##    1:    0                64                NA                 F
-##    2:    1                71                NA                 F
-##    3:    2                71                NA                 F
-##    4:    3                80                NA                 F
-##    5:    4                NA                NA                 F
-##   ---                                                           
-## 7932:  690                NA                NA                 M
-## 7933:  691                NA                NA                 M
-## 7934:  692                NA                NA                 M
-## 7935:  693                NA                NA                 M
-## 7936:  694                NA                NA                 M
-##              site episode_id NIHR_HIC_ICU_0112.meta
-##    1: pseudo_site          1                     NA
-##    2: pseudo_site          1                     NA
-##    3: pseudo_site          1                     NA
-##    4: pseudo_site          1                     NA
-##    5: pseudo_site          1                     NA
-##   ---                                              
-## 7932: pseudo_site          9                     NA
-## 7933: pseudo_site          9                     NA
-## 7934: pseudo_site          9                     NA
-## 7935: pseudo_site          9                     NA
-## 7936: pseudo_site          9                     NA
-
-
-

Manipulate on cctable

-
    -
  • Get the mean heart rate of each patient.
  • -
-
tb$tclean[, mean(NIHR_HIC_ICU_0108, na.rm=T), by=c("site", "episode_id")]
-
##            site episode_id        V1
-##  1: pseudo_site          1  73.00000
-##  2: pseudo_site         10  80.70370
-##  3: pseudo_site         11  87.57143
-##  4: pseudo_site         12  95.61667
-##  5: pseudo_site         13 130.09091
-##  6: pseudo_site         14       NaN
-##  7: pseudo_site         15 117.50000
-##  8: pseudo_site         16       NaN
-##  9: pseudo_site         17  88.40719
-## 10: pseudo_site         18       NaN
-## 11: pseudo_site         19  89.50845
-## 12: pseudo_site          2 103.14615
-## 13: pseudo_site         20  72.02439
-## 14: pseudo_site         21       NaN
-## 15: pseudo_site         22       NaN
-## 16: pseudo_site         23  98.48810
-## 17: pseudo_site         24 123.16566
-## 18: pseudo_site         25       NaN
-## 19: pseudo_site         26  87.63636
-## 20: pseudo_site         27 121.37143
-## 21: pseudo_site         28 111.96195
-## 22: pseudo_site         29  75.40000
-## 23: pseudo_site          3       NaN
-## 24: pseudo_site         30  53.00000
-## 25: pseudo_site          4 114.60386
-## 26: pseudo_site          5 102.20000
-## 27: pseudo_site          6 107.87650
-## 28: pseudo_site          7  70.40595
-## 29: pseudo_site          8       NaN
-## 30: pseudo_site          9       NaN
-##            site episode_id        V1
-
-
-

Data cleaning

-

To clean the data, one needs to write the specification in the YAML configuration file.

-
conf <-"
-NIHR_HIC_ICU_0108:
-  shortName: hrate
-  dataItem: Heart rate
-  distribution: normal
-  decimal_places: 0
-  range:
-    labels:
-      red: (0, 300)
-      amber: (11, 150)
-    apply: drop_entry
-  missingness: # remove episode if missingness is higher than 70% in any 24 hours interval 
-    labels:
-      yellow: 24
-    accept_2d:
-      yellow: 70 
-    apply: drop_episode
-"
-
-ctb <- create.cctable(ccd, yaml.load(conf), freq=1)
-ctb$filter.ranges("amber") # apply range filters
-ctb$filter.missingness()
-ctb$apply.filters()
-
-cptb <- rbind(cbind(ctb$torigin, data="origin"), 
-              cbind(ctb$tclean, data="clean"))
-
-
-ggplot(cptb, aes(x=time, y=NIHR_HIC_ICU_0108, color=data)) + 
-  geom_point(size=1.5) + facet_wrap(~episode_id, scales="free_x")
-

-
-
- - - - - - - - diff --git a/inst/pipeline/break_into.sh b/inst/pipeline/break_into.sh deleted file mode 100755 index ac16606..0000000 --- a/inst/pipeline/break_into.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash -xe - -if [[ $# < 2 ]] -then - echo "You need to input a file name and the number of patients per file" - echo "$0 filename.xml maxpatients" - exit 1 -fi - -default_ext='partxml' -# or file? -dchar="d:" -dchar_exist=$(head -2 $1 | grep -c "<${dchar}") -[ ${dchar_exist} -eq 0 ] && dchar="" - -subject="${dchar}subject" - -# Add lines before and after each subject starts -sed -i.orig -e 's|<'"${subject}"'>|\n<'"${subject}"'>\n|' ${1} - -# change the end subject for something different - -# so it is not counted in the awk below. - sed -i -e 's||\n|' ${1} - -# Remove previous files -if [[ -e ${1}_0.${default_ext} ]]; then - rm ${1}*${default_ext} -fi - -# Break the file into chunks where <${subject}> occurs. -# Each time is found, delim will increase -# if delim/maxpatients (2nd argument) == 1 then -# create a new file. -# initialising delim as -1 so the first file includes the -# number of subjects asked. -awk 'BEGIN {delim=-1} \ - /\<'"${subject}"'\>/ { delim++ } \ - {file = sprintf("'${1}'_%s.'${default_ext}'", int(delim/'${2}'));\ - print >> file; } \ - END { print "'${1}' has ", delim+1, "subjects"}' ${1} - - -# extract the header of the file with its meta. -# - extract everything till the first (what's used to separate the file) -# - remove the instance for so it's not repeated when inserted. -# - remove all no printing characters - it seems there's one making the insertion to -# fail afterwards. -# head won't work because some files run over multiple lines -firstlines=$(sed -n '1,/<'"${subject}"'>/p' ${1}_0.${default_ext} | \ - sed 's/<'"${subject}"'>//' | tr -dc '[:print:]') - -# TODO: to extract the footer automatically. -lastline="" -nfiles=$(ls "${1}"_* | wc -l) - -# loop over all the files to add header and footer for each file that needs it - -for ((i=0; i<${nfiles}; i++)) -do - output=${1}_${i}.${default_ext} - # replace the label changed before - sed -i -e "s|||" ${output} - - # add footer to the files - if [ $i -lt $((${nfiles} - 1)) ] - then - echo "$lastline" >> ${output} - fi - - # add header to the files - if [ $i -gt 0 ] - then - sed -i "1s|^|${firstlines}|" ${output} - fi -done - -# Remove the temporary file used -mv ${1}.orig ${1} diff --git a/inst/pipeline/ccdata2cleanEHR.r b/inst/pipeline/ccdata2cleanEHR.r deleted file mode 100644 index 7e1d112..0000000 --- a/inst/pipeline/ccdata2cleanEHR.r +++ /dev/null @@ -1,22 +0,0 @@ -# This script is to address the ccdata and cleanEHR data object -# incompatible problem. You need both ccdata and cleanEHR being installed. - - -library(ccdata) -file <- (commandArgs(TRUE)[1]) -load(file) -data_obj <- ls()[1] - - - -new_obj <- cleanEHR::ccRecord() + - for_each_episode(eval(parse(text = data_obj)), - function(x) { - cleanEHR::new.episode(x@data) - }) - - -assign(data_obj, new_obj) - -file_name <- strsplit(file, "[.]")[[1]][1] -save(file=paste0(file_name, "_v2.RData"), list=data_obj) diff --git a/inst/pipeline/combine_data.r b/inst/pipeline/combine_data.r deleted file mode 100755 index e3867ac..0000000 --- a/inst/pipeline/combine_data.r +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/Rscript - -library(cleanEHR) - -args = commandArgs(trailingOnly=TRUE) - -institute <- c("CUH", "Oxford", "GSTT", "imperial", "UCLH") - -data<-dir(institute, full.name=TRUE)[grep(".RData", dir(institute, - full.name=TRUE))] - -new <- ccRecord() -for (d in data) { - print(d) - load(d) - new <- new + ccd -} - -ccd <- new - -save(ccd, file=args[1]) diff --git a/inst/pipeline/convert.r b/inst/pipeline/convert.r deleted file mode 100644 index fdb095f..0000000 --- a/inst/pipeline/convert.r +++ /dev/null @@ -1,13 +0,0 @@ -ym <- yaml.load_file('data/ITEM_REF.yaml') -csv <- read.csv("shortname2.csv") -csv <- csv[2:3] - -for (i in seq(nrow(csv))) { - code <- - as.character(data.checklist$NHICcode[data.checklist$dataItem==as.character(csv[i,2])]) - ym[[code]]$shortName <- as.character(csv[i, 1]) -} - -f <- file("new.yml") -writeLines(con=f, as.yaml(ym)) -close(f) diff --git a/inst/pipeline/extract_data.r b/inst/pipeline/extract_data.r deleted file mode 100755 index 8c2098b..0000000 --- a/inst/pipeline/extract_data.r +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/Rscript - -library(cleanEHR) - -args = commandArgs(trailingOnly=TRUE) - -cat("extract patients from", c(args[1], "\n")) -ccd <- xml2Data(args[1]) -save(ccd, file=paste(args[1], ".RData", sep="_")) diff --git a/inst/pipeline/pipeline.sh b/inst/pipeline/pipeline.sh deleted file mode 100755 index 4158d65..0000000 --- a/inst/pipeline/pipeline.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash - -# Pipeline code to generate the ccdata structure - - -DEFAULT_SPACES='xml' -allPatients='all_patients.RData' -allPatients_untime='delta_num.RData' - -function remove_spaces (){ - mv "${1}" "${1// /_}" -} -# make the function available as a command so find can use it. -export -f remove_spaces - -function remove_spaces_ext (){ - if [ -z "$1" ] - then - echo "-- removing spaces on \"${DEFAULT_SPACES}\" files -- " - else - echo "-- removing spaces on \"${1}\" files -- " - fi - - extension=${1-$DEFAULT_SPACES} - - FILES_spaced_num=$(find -L ./ -type f -iname '* *.'"${extension}" | wc -l) - find -L ./ -type f -iname '*\ *.'"${extension}" -exec bash -c 'remove_spaces "{}"' \; - # if you try to do this as a loop read: - # http://mywiki.wooledge.org/BashPitfalls#for_i_in_.24.28ls_.2A.mp3.29 - - echo "-- converted ${FILES_spaced_num} files --" -} - - -#============================================================ -# remove spaces from file names, otherwise xargs won't parallelise -#============================================================ -remove_spaces_ext - -#============================================================ -# Break files into smaller chunks in parallel -# - find the files, sort them by size (%k) -# - extract the file names and run break_into 4 at a time -# => filename.xml_xx.xml; where xx is a non padded number -#============================================================ -find -L ./ -type f -iname '*.xml' -printf "%k %p\n" | sort -nr \ - | awk '{print $2}' | xargs -n1 -P 4 -I % ./break_into.sh % 3 - -#============================================================ -# Convert each portion to ccdata -#============================================================ -find -L ./ -type f -iname '*.partxml' | xargs -n1 -P 4 ./extract_data.r - -#============================================================ -# Combine all the files -#============================================================ -./combine_data.r ${allPatients} - -#============================================================ -# Anonymise data removing timestamp -#============================================================ -./untimeit.r ${allPatients} ${allPatients_untime} - -echo "Files ${allPatients} and ${allPatients_untime} created." diff --git a/inst/pipeline/reduce_xml.sh b/inst/pipeline/reduce_xml.sh deleted file mode 100755 index 4502bf2..0000000 --- a/inst/pipeline/reduce_xml.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -# example: -# ./reduce_xml anon_CC.xml 10 - -if [ -e output.xml ] -then - rm output.xml -fi - -patient_num=$(($2-1)) - -touch output.xml -count=0 - -while read line -do - if [ $count -gt $patient_num ] - then - echo 'end' - break - else - echo $line >> output.xml - if echo $line|grep -q '<\/d:subject>' - then - count=$(($count+1)) - fi - fi -done < $1 - -tail -n 1 $1 >> output.xml diff --git a/inst/pipeline/shortname2.csv b/inst/pipeline/shortname2.csv deleted file mode 100644 index b627bef..0000000 --- a/inst/pipeline/shortname2.csv +++ /dev/null @@ -1 +0,0 @@ -,short,item 1,ICNNO,Site code (ICNARC CMP number) 2,bed02,CCU bed configuration 02 3,bed03,CCU bed configuration 03 4,bed05,CCU bed configuration 05 5,bed50,CCU bed configuration 90 6,NHSNO,NHS number 7,pasno,PAS number 8,ADNO,Critical care local identifier / ICNARC admission number 9,DOB,Date of birth 10,PCODE,Postcode 11,GPCODE,Code of GP 12,ETHNIC,Ethnicity 13,SEX,Sex 14,HCM,Height 15,HCMEST,Height (Source) 16,WKG,Weight 17,WKGEST,Weight (Source) 18,DAH,Date of admission to your hospital 19,DAICU,Date & Time of admission to your unit 20,DWFRD,Date fully ready for discharge 21,TWFRD,Time fully ready for discharge 22,DDICU,Date & Time of discharge from your unit 23,DUDICU,Date of ultimate discharge from ICU/HDU 24,DDH,Date of discharge from your hospital 25,DUDH,Date of ultimate discharge from your hospital 26,UDIS,Status at ultimate discharge from ICUHDU 27,RESA,Residence prior to admission to acute hospital 28,HLOCA,Hospital housing location (in) 29,LOCA,Location (in) 30,PA_V3,Admission type 31,SCODE,Treatment function code 32,CLASSNS,classification of surgery 33,CCL2D,Level 2 (HDU) days 34,CCL3D,Level 3 (ICU) days 35,RDIS_V3,Discharge status (Reason for discharge from your unit) 36,LOCD,Discharge location (location out) 37,TNESSD,Timeliness of discharge from your unit 38,LEVD,Level of care at discharge from your unit 39,DIS,Dead or alive on discharge 40,DOAH,Date of original admission to/attendance at acute hospital 41,SOHA,Sector of other hospital (in) 42,PLOCA,Prior location (in) 43,TUADNO,Transferring unit admission number 44,DOAICU,Date of original admission to ICU/HDU 45,TUIDI,Transferring unit identifier (in) 46,TYPEIHA,Type of adult ICU/HDU (in) 47,TGA,Adult ICU/HDU within your critical care transfer group (in) 48,CCA,Critical care visit prior to this admission to your unit 49,DLCCA,Date of last critical care visit prior to this admission to your unit 50,HLOCD,Hospital housing location (out) 51,SOHD,Sector of other hospital (out) 52,TYPEIHD,Type of adult ICU/HDU (out) 53,TGD,Adult ICU/HDU within your critical care transfer group (out) 54,HDIS,Status at discharge from your hospital 55,CCD,Critical care visit post-discharge from your unit 56,DFCCD,Date of first critical care post-discharge from your unit 57,DESTH_V3,Destination post discharge within your hospital 58,RESD,Residence post discharge from acute hospital 59,UHDIS,Status at ultimate discharge from hospital 60,PSP,Admission for pre-surgical preparation 61,CPR_V3,Cardiopulmonary resuscitation within 24 hours prior to admission to unit 62,RAICU1,Primary reason for admission to your unit 63,RAICU2,Secondary reasons for admission to your unit 64,URAICU,Ultimate primary reason for admission to unit 65,CHEMOX,Chemotherapy (within the last 6months) steroids alone excluded 66,RADIOX,Radiotherapy 67,CICIDS,Congenital immunohumoral or cellular immune deficiency state 68,AMLALLMM,Acute myeloid/lymphocytic leukaemia or myeloma 69,CMLCLL,Chronic myelogenous /lymphocytic leukaemia 70,LYM,Lymphoma 71,META,Metastatic disease 72,CRRX,Chronic renal replacement therapy 73,BPC,Biopsy proven cirrhosis 74,HE,Heaptic encephalopathy 75,PH,Portal hypertension 76,AIDS_V3,HIV/AIDS 77,HV,Home ventilation 78,SRD,Severe respiratory disease 79,STERX,Steroid treatment 80,VSCD,Very severe cardiovascular disease 81,OCPMH,Other condition in past medical history 82,DEP,Dependency prior to admission 83,apache_score,APACHE II Score 84,apache_prob,APACHE II Probability 85,TNESSA,Delayed admission 86,DHRS,Delay 87,AMUAI,Antimicrobial use after 48 hours in your unit 88,ORGAN_SUPPORT,Organ support maximum 89,ARSD,Advanced respiratory support days 90,BRSD,Basic respiratory support days 91,ACSD,Advanced Cardiovascular support days 92,BCSD,Basic Cardiovascular support days 93,RSD,Renal support days 94,NSD,Neurological support days 95,LSD,Liver support days 96,DSD,Dermatological support days 97,GSD,Gastrointestinal support days 98,ITW_V3,Treatment withheld/withdrawn 99,DTW,Date treatment first withdrawn 100,TTW,Time treatment first withdrawn 101,BSDTP,Brain stem death declared 102,DDBSD,Date of declaration of brain stem death 103,TDBSD,Time of declaration of brain stem death 104,DOD,Date of death on yout unit 105,TOD,Time of death on your unit 106,REFOD,Referred for solid organ or tissue donation 107,OD_V3,Solid organ or tissue donor 108,DBRICU,Date body removed from your unit 109,TBRICU,Time body removed from your unit 110,advsupt_resp,Advanced respiratory support 111,basicsupt_resp,Basic respiratory support 112,advsupt_resp,Advanced Cardiovascular support 113,basicsupt_cardv,Basic Cardiovascular support 114,supt_renal,Renal support 115,supt_neuro,Neurological support 116,supt_liver,Liver support 117,supt_dermat,Dermatological support 118,supt_gastr,Gastrointestinal support 119,gcs_total,GCS - total 120,gcs_motor,GCS - motor component 121,gcs_eye,GCS - eye component 122,gcs_verbal,GCS - verbal component 123,sedation_score,Sedation score (hourly) 124,sedataion,Sedation yes/no 125,h_rate,Heart rate 126,h_rhythm,Heart rhythm 127,bp_m_a,Mean arterial blood pressure - Art BPMean arterial blood pressure 128,bp_m_ni,Mean arterial blood pressure - NBPMean arterial blood pressure 129,bp_sys_a,Systolic Arterial blood pressure - Art BPSystolic Arterial blood pressure 130,bp_sys_ni,Systolic Arterial blood pressure - NBPSystolic Arterial blood pressure 131,bp_dia_a,Diastolic arterial blood pressure - Art BPDiastolic arterial blood pressure 132,bp_dia_ni,Diastolic arterial blood pressure - NBPDiastolic arterial blood pressure 133,venous_p,Central venous pressure 134,lidco_plus,Cardiac output - LiDCO Plus 135,lidco_rapid,Cardiac output - LiDCO Rapid 136,picco,Cardiac output - PICCO 137,pa_catheter,Cardiac output - PA Catheter 138,doppler,Cardiac output - Doppler 139,airway,Airway 140,ventilation,Invasive or non-invasive (ventilation) 141,respiratory_rate_spon,Spontaneous Respiratory Rate 142,respiratory_rate_totl,Total respiratory rate (monitor) 143,spo2,SpO2 144,pao2_fio2,PaO2/FiO2 ratio 145,fraction_oxygen,Inspired fraction of oxygen 146,pe_expiratory_pressure,Positive End Expiratory Pressure 147,mand_resp_rate,Mandatory Respiratory Rate 148,total_resp_rate,Total respiratory rate (ventilator) 149,minute_volume,Minute volume 150,tidal_volume,Tidal volume 151,airway_pressure,Peak airway pressure 152,frequency,Frequency (Hz) 153,cycle_volumn,Cycle Volume 154,base_flow,Base flow 155,airway_pressure,Airway pressure 156,fluid_balance,Fluid Balance (daily) 157,fluid_balance,Fluid Balance (hourly) 158,urine_output,Urine output 159,renal_replace,Renal replacement mode 160,duration_therapy,Duration of therapy (hours per day) 161,effluent_per_day,Total effluent per day 162,dialysate,Dialysate 163,replace_fluid_RRT,Replacement fluid during RRT 164,anticoagulation,Type of anticoagulation 165,position,Position 166,temperature_central,Temperature - Central 167,temperature_non_central,Temperature - Non-central 168,sodium,Sodium 169,sodium_abg_vbg,Sodium ABG/VBG 170,potassium,Potassium 171,potassium_abg_vbg,Potassium ABG/VBG 172,urea,Urea 173,creatinine,Creatinine 174,bilirubin,Bilirubin 175,glucose_lab,Glucose (laboratory) 176,glucode_abg_vbg,Glucose ABG/VBG 177,glucode_bedtest,Glucose bedside test 178,cprotein,C reactive protein 179,ph_abg_vbg,pH - ABG / VBG 180,hco3_abg_vbg,HCO3 - ABG / VBG 181,lactate_abg,Lactate - ABG 182,lactate_lab,Lactate - Lab 183,haemoglobin,Haemoglobin 184,haemoglobin_abg_vbg,Haemoglobin ABG/VBG 185,white_cell,White cell count 186,neutrophil,Neutrophil count 187,platelets,Platelets 188,site,Site 189,organism,Organism 190,sensitivity,Sensitivity 191,pao2_abg,PaO2 - ABG 192,sao2_abg,SaO2 - ABG 193,venous_saturation,Central venous saturation 194,paco2_abg,PaCO2 - ABG 195,amikacin,Amikacin 196,gentamicin,Gentamicin 197,neomycin,Neomycin 198,tobramycin,Tobramycin 199,pentamidine,Pentamidine 200,ethambutal hcl,Ethambutal HCL 201,isoniazid,Isoniazid 202,pyrazinamide,Pyrazinamide 203,rifampacin,Rifampacin 204,rifater,Rifater 205,rifinah,Rifinah 206,ertapenem,Ertapenem 207,meropenem,Meropenem 208,cefotaxime,Cefotaxime 209,ceftazidime,Ceftazidime 210,ceftriaxone,Ceftriaxone 211,cefuroxime,Cefuroxime 212,chloramphenicol,Chloramphenicol 213,fuscidic acid,Fuscidic acid 214,sodium fusidate,Sodium Fusidate 215,teicoplanin,Teicoplanin 216,vancomycin,Vancomycin 217,clindamycin,Clindamycin 218,azithromycin,Azithromycin 219,clarithromycin,Clarithromycin 220,erythromycin,Erythromycin 221,nitrofurantion,Nitrofurantion 222,metronidazole,Metronidazole 223,linezolid,Linezolid 224,amoxicillin,Amoxicillin 225,benzylpenicillin,Benzylpenicillin 226,co-amoxiclav,Co-Amoxiclav 227,flucloxacillin,Flucloxacillin 228,phenoxymethylpenicillin,Phenoxymethylpenicillin 229,piperacillin_tazobactam,Piperacillin/Tazobactam 230,colistin,Colistin 231,ciprofloxacin,Ciprofloxacin 232,levofloxacin,Levofloxacin 233,moxifloxacin,Moxifloxacin 234,ofloxacin,Ofloxacin 235,co-trimoxazole,Co-Trimoxazole 236,trimethoprim,Trimethoprim 237,demeclocycline hcl,Demeclocycline HCL 238,doxycycline,Doxycycline 239,tigecycline,Tigecycline 240,propofol,Propofol 241,thiopentone_thiopental,Thiopentone / Thiopental 242,midazolam,Midazolam 243,clonidine,Clonidine 244,dexmedetomidine,Dexmedetomidine 245,ketamine,Ketamine 246,fentanyl,Fentanyl 247,morphine,Morphine 248,remifentanil,Remifentanil 249,levosimendan,Levosimendan 250,adrenaline,Adrenaline 251,dobutamine,Dobutamine 252,dopamine,Dopamine 253,dopexamine,Dopexamine 254,enoximone,Enoximone 255,milrinone,Milrinone 256,noradrenaline,Noradrenaline 257,vasopressin,Vasopressin 258,terlipressin,Terlipressin 259,esmolol,Esmolol 260,metoprolol,Metoprolol 261,dexamethasone,Dexamethasone 262,hydrocortisone,Hydrocortisone 263,methylprednisolone,Methylprednisolone \ No newline at end of file diff --git a/inst/pipeline/untimeit.r b/inst/pipeline/untimeit.r deleted file mode 100755 index 5ae5a3c..0000000 --- a/inst/pipeline/untimeit.r +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/Rscript - -library(cleanEHR) - -args = commandArgs(trailingOnly=TRUE) - -load(args[1]) -ccd <- reindexRecord(ccd) -ccd <- deltaTime(ccd, pseudotime=T) -ccd <- uniquePatients(ccd) -ccd_delta_num <- ccd -save(ccd_delta_num, file=args[2]) diff --git a/inst/report/data_quality_report.Rmd b/inst/report/data_quality_report.Rmd index 25af53a..07d707d 100644 --- a/inst/report/data_quality_report.Rmd +++ b/inst/report/data_quality_report.Rmd @@ -1,10 +1,10 @@ --- title: Data Quality Report nep: `r ccd@nepisodes` -np: `r max(ccd@infotb$pid, na.rm=T)` +np: `r max(ccd@infotb$pid, na.rm=TRUE)` site: `r s <- unique(ccd@infotb$site_id); s[s!="NA"]` -admt: `r min(ccd@infotb$t_admission, na.rm=T)` -disct: `r max(ccd@infotb$t_discharge, na.rm=T)` +admt: `r min(ccd@infotb$t_admission, na.rm=TRUE)` +disct: `r max(ccd@infotb$t_discharge, na.rm=TRUE)` dbupdate: `r max(ccd@infotb$parse_time)` fulldb: `r dbfull` @@ -34,14 +34,14 @@ demg <- sql.demographic.table(ccd) # Data Summary ```{r, echo=FALSE} dp <- total.data.point(ccd) -np <- max(ccd@infotb$pid, na.rm=T) +np <- max(ccd@infotb$pid, na.rm=TRUE) ``` This database contains **`r ccd@nepisodes`** episode data from **`r length(s[s!="NA"])`** sites. Based on the NHS numbers and PAS numbers, we can identify **`r np`** unique patients, among which the earliest -admission is `r min(ccd@infotb$t_admission, na.rm=T)` and the latest discharge time is -`r max(ccd@infotb$t_discharge, na.rm=T)`. There are `r dp ` +admission is `r min(ccd@infotb$t_admission, na.rm=TRUE)` and the latest discharge time is +`r max(ccd@infotb$t_discharge, na.rm=TRUE)`. There are `r dp ` total data points found in the current database, which makes in average `r round(dp/np)` per unique patient. @@ -54,7 +54,7 @@ pander(as.data.frame(site.info()[, 1:3], style="rmarkdown")) ## The original XML files and parse information ```{r, echo=FALSE, results="asis"} fs <- file.summary(ccd) -pander(as.data.frame(fs[, c("File", "Number of Episode", "Sites"), with=F])) +pander(as.data.frame(fs[, c("File", "Number of Episode", "Sites"), with=FALSE])) ``` ```{r fig.width=15, fig.height=10, echo=FALSE} diff --git a/inst/sql/create_table.sql b/inst/sql/create_table.sql deleted file mode 100644 index a5c4ca1..0000000 --- a/inst/sql/create_table.sql +++ /dev/null @@ -1,65 +0,0 @@ -/******************** Purge the database ************************/ -DROP TABLE IF EXISTS "episodes" CASCADE; -DROP TABLE IF EXISTS "demographic" CASCADE; -DROP TABLE IF EXISTS "physiology" CASCADE; - -DROP SEQUENCE IF EXISTS "episode_seq"; -CREATE SEQUENCE "episode_seq" INCREMENT BY 1; -DROP SEQUENCE IF EXISTS "demographic_seq"; -CREATE SEQUENCE "demographic_seq" INCREMENT BY 1; -DROP SEQUENCE IF EXISTS "physiology_seq"; -CREATE SEQUENCE "physiology_seq" INCREMENT BY 1; - ---ALTER TABLE "episodes" DROP CONSTRAINT IF EXISTS pkepisodes; -/******************** Add Table: "episodes" ************************/ -/* Build Table Structure */ -CREATE TABLE "episodes" -( - "Id" INTEGER DEFAULT nextval('episode_seq'::regclass) NOT NULL, - nhs_number CHAR(200) NULL, - pas_number CHAR(200) NULL, - site_id CHAR(200) NULL, - local_id CHAR(200) NULL -); - -/* Add Primary Key */ -ALTER TABLE "episodes" ADD CONSTRAINT pkepisodes -PRIMARY KEY ("Id"); - -/******************** Add Table: "demographic" ************************/ -/* Build Table Structure */ -CREATE TABLE "demographic" -( - "Id" INTEGER DEFAULT nextval('demographic_seq'::regclass) NOT NULL, - item CHAR(100) NOT NULL, - value CHAR(100) NULL, - episode_id INTEGER NOT NULL -); - -/* Add Primary Key */ -ALTER TABLE "demographic" ADD CONSTRAINT pkdemographic -PRIMARY KEY ("Id"); - -/******************** Add Table: "physiology" ************************/ -/* Build Table Structure */ -CREATE TABLE "physiology" -( - "Id" INTEGER DEFAULT nextval('physiology_seq'::regclass) NOT NULL, - item CHAR(100) NOT NULL, - value CHAR(100) NULL, - episode_id INTEGER NOT NULL, - time_stamp TIMESTAMP NOT NULL -); - -/* Add Primary Key */ -ALTER TABLE "physiology" ADD CONSTRAINT pkphysiology -PRIMARY KEY ("Id"); - - - -/************ Add Foreign Keys ***************/ - -/* Add Foreign Key: fk_demographic_episodes */ -ALTER TABLE "demographic" ADD CONSTRAINT fk_demographic_episodes -FOREIGN KEY (episode_id) REFERENCES "episodes" ("Id") -ON UPDATE NO ACTION ON DELETE NO ACTION; diff --git a/inst/sql/simple.sql b/inst/sql/simple.sql deleted file mode 100644 index 34d68c7..0000000 --- a/inst/sql/simple.sql +++ /dev/null @@ -1,35 +0,0 @@ -DROP TABLE IF EXISTS "demographic" CASCADE; -DROP TABLE IF EXISTS "measurement" CASCADE; - -DROP SEQUENCE IF EXISTS "episode_seq"; -CREATE SEQUENCE "episode_seq" INCREMENT BY 1; -DROP SEQUENCE IF EXISTS "demographic_seq"; -CREATE SEQUENCE "demographic_seq" INCREMENT BY 1; -DROP SEQUENCE IF EXISTS "measurement_seq"; -CREATE SEQUENCE "measurement_seq" INCREMENT BY 1; - -CREATE TABLE "measurement" -( - "Id" INTEGER DEFAULT nextval('measurement_seq'::regclass) NOT NULL, - item CHAR(100) NOT NULL, - value CHAR(100) NULL, - episode_id INTEGER NOT NULL, - time_stamp TIMESTAMP NOT NULL -); - - -/******************** Add Table: "demographic" ************************/ -CREATE TABLE "demographic" -( - "Id" INTEGER DEFAULT nextval('demographic_seq'::regclass) NOT NULL, - item CHAR(100) NOT NULL, - value CHAR(100) NULL, - episode_id INTEGER NOT NULL -); - - -/*CREATE TABLE "identifiers" -( - "Id" INTEGER DEFAULT nextval('episode_seq'::regclass) NOT NULL, - -);*/ diff --git a/inst/sql/test_mongo.r b/inst/sql/test_mongo.r deleted file mode 100644 index 15aa352..0000000 --- a/inst/sql/test_mongo.r +++ /dev/null @@ -1,13 +0,0 @@ -library(cleanEHR) -library(rmongodb) -library(assertthat) -mongo <- mongo.create() -assert_that(mongo.is.connected(mongo)) -# how many databases -print(mongo.get.databases(mongo)) -coll <- "ccd.ccd1" - -#for_each_episode(ccd_delta_num, -# function(x) { -# mongo.insert(mongo, coll, mongo.bson.from.list(x@data)) -# }) diff --git a/man/ITEM_REF.Rd b/man/ITEM_REF.Rd new file mode 100644 index 0000000..ea21f63 --- /dev/null +++ b/man/ITEM_REF.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{ITEM_REF} +\alias{ITEM_REF} +\title{Field reference table} +\description{ +Field reference table +} +\keyword{data} + diff --git a/man/add.episode.list.to.record.Rd b/man/add.episode.list.to.record.Rd deleted file mode 100644 index f7d6660..0000000 --- a/man/add.episode.list.to.record.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ccRecord.R -\name{add.episode.list.to.record} -\alias{add.episode.list.to.record} -\title{Adding a list of ccEpisode to ccRecord} -\usage{ -add.episode.list.to.record(rec, lst) -} -\arguments{ -\item{rec}{ccRecord} - -\item{lst}{a list of ccEpisode objects} -} -\value{ -ccRecord -} -\description{ -Adding a list of one or multiple ccEpisode objects to a -ccRecord object, the information table (infotb) will be updated automatically. -It is the more efficient way to add multiple ccEpisode objects. See -add.episode.to.record() for just adding one ccEpisode. -} - diff --git a/man/add.episode.to.record.Rd b/man/add.episode.to.record.Rd deleted file mode 100644 index 992693c..0000000 --- a/man/add.episode.to.record.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ccRecord.R -\name{add.episode.to.record} -\alias{add.episode.to.record} -\title{Adding one ccEpisode object to ccRecord object.} -\usage{ -add.episode.to.record(rec, episode) -} -\arguments{ -\item{rec}{ccRecord-class} - -\item{episode}{ccEpisode-class} -} -\value{ -ccRecord object -} -\description{ -Adding one ccEpisode object to ccRecord object. -} - diff --git a/man/add.record.to.record.Rd b/man/add.record.to.record.Rd deleted file mode 100644 index dfc3d99..0000000 --- a/man/add.record.to.record.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ccRecord.R -\name{add.record.to.record} -\alias{add.record.to.record} -\title{Combine two ccRecord objects} -\usage{ -add.record.to.record(rec1, rec2) -} -\arguments{ -\item{rec1}{ccRecord object} - -\item{rec2}{ccRecord object} -} -\value{ -ccRecord object -} -\description{ -Combine two ccRecord objects and re-calculate the infortb -} - diff --git a/man/ccEpisode-class.Rd b/man/ccEpisode-class.Rd index a60e2a0..a911564 100644 --- a/man/ccEpisode-class.Rd +++ b/man/ccEpisode-class.Rd @@ -4,9 +4,9 @@ \name{ccEpisode-class} \alias{ccEpisode} \alias{ccEpisode-class} -\title{The S3 class which holds data of a single episode.} +\title{The S4 class which holds data of a single episode.} \description{ -The S3 class which holds data of a single episode. +The S4 class which holds data of a single episode. } \section{Fields}{ diff --git a/man/ccRecord-class.Rd b/man/ccRecord-class.Rd index 5429a61..25fc3ea 100644 --- a/man/ccRecord-class.Rd +++ b/man/ccRecord-class.Rd @@ -4,10 +4,10 @@ \name{ccRecord-class} \alias{ccRecord} \alias{ccRecord-class} -\title{The S3 class which holds all the CCHIC patient record - served as a database.} +\title{The S4 class which holds all the CCHIC patient record - served as a database.} \description{ -ccRecord is a class to hold the raw episode data parsed directly from XML or -CSV files. +ccRecord is a class to hold the raw episode data parsed directly +from XML or CSV files. } \section{Fields}{ diff --git a/man/ccRecord_subset_files.Rd b/man/ccRecord_subset_files.Rd deleted file mode 100644 index bbe5027..0000000 --- a/man/ccRecord_subset_files.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ccRecord.R -\name{ccRecord_subset_files} -\alias{ccRecord_subset_files} -\title{Subset episodes from the specified XML files.} -\usage{ -ccRecord_subset_files(ccd, files) -} -\arguments{ -\item{ccd}{ccRecord object} - -\item{files}{character a vector of XML file names - see ccRecord: parse_file} -} -\value{ -ccRecord object -} -\description{ -Subset episodes from the specified XML files. -} - diff --git a/man/ccTable-class.Rd b/man/ccTable-class.Rd index 2b243fa..e890ada 100644 --- a/man/ccTable-class.Rd +++ b/man/ccTable-class.Rd @@ -4,12 +4,14 @@ \name{ccTable-class} \alias{ccTable} \alias{ccTable-class} -\title{Rearrange and clean the critical care record into a 2D table.} +\title{Process the EHR data in table format} \description{ -Data rearranging and major data cleaning processes will be performed under -the ccTable structre. It holds the original record (ccRecord), the dirty table -(torigin) clean table (tclean) and various data quality information (dquality). -Various data filters can also be found within the ccTable class. +ccRecord data are re-arranged into tables where the columns stands for +data fields (e.g. heart rate, blood pressure) and the rows stands for +each data record within a unique cadence. See ccTable_create_cctable. +ccTable is the data processing platform. It stores both original data +and processed data alongside with the process details. It also contains +various commonly used data filters. } \section{Fields}{ @@ -18,9 +20,9 @@ Various data filters can also be found within the ccTable class. \item{\code{conf}}{the YAML style configuration.} -\item{\code{torigin}}{the original wide data table.} +\item{\code{torigin}}{the original data table.} -\item{\code{tclean}}{the wide data table after cleaning processes.} +\item{\code{tclean}}{the data table after cleaning processes.} \item{\code{dfilter}}{list contains data filtering information.} @@ -33,29 +35,29 @@ Various data filters can also be found within the ccTable class. \section{Methods}{ \describe{ -\item{\code{apply.filters(warnings = T)}}{Apply all filters specified in the configuration to update the clean +\item{\code{apply_filters(warnings = TRUE)}}{Apply all filters specified in the configuration to update the clean table (tclean)} -\item{\code{create.table(freq)}}{Create a table contains the selected items in the conf with a given +\item{\code{create_table(freq)}}{Create a table contains the selected items in the conf with a given frequency (in hour)} -\item{\code{export.csv(file = NULL)}}{Export the cleaned table to a CSV file.} +\item{\code{export_csv(file = NULL)}}{Export the cleaned table to a CSV file.} -\item{\code{filter.category()}}{Check individual entries if they are the in the categories specified +\item{\code{filter_categories()}}{Check individual entries if they are the in the categories specified in conf.} -\item{\code{filter.missingness(recount = FALSE)}}{filter out the where missingness is too low.} +\item{\code{filter_missingness(recount = FALSE)}}{filter out the where missingness is too low.} -\item{\code{filter.null(items = c("episode_id", "site"))}}{remove the entire episode when any of the selected items is NULL} +\item{\code{filter_nodata()}}{Exclude episodes when no data is presented in certain fields} \item{\code{imputation()}}{Filling missing data to a time series data by performing a given imputation method on a selected window period nearby the missing data.} -\item{\code{reload.conf(file)}}{reload yaml configuration.} +\item{\code{reload_conf(conf)}}{reload yaml configuration.} }} \examples{ rec <- ccRecord() -cctable <- create.cctable(rec, freq=1) +cctable <- create_cctable(rec, freq=1) cctable <- cctable$clean() #table <- cctable$tclean } diff --git a/man/ccTable_apply_filters.Rd b/man/ccTable_apply_filters.Rd new file mode 100644 index 0000000..30eb049 --- /dev/null +++ b/man/ccTable_apply_filters.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ccTable.R +\name{ccTable_apply_filters} +\alias{ccTable_apply_filters} +\title{Apply all the setup filters.} +\arguments{ +\item{warnings}{logical value to indicate more or less messages with an +default value TRUE.} +} +\description{ +Once filters are applied, the processed data will be stored in tclean. Note, +running filtering function before apply_filters is necessary. This function +will have no effect on tclean if no filter is ran prior. +Filters will decide to preserve or remove particular entries or episodes. +} +\examples{ +\dontrun{ +tb <- create_cctable(ccd, conf, 1) +tb$range_filter() +tb$apply_filter() # apply only the range filter ragardless of the conf. +} +} + diff --git a/man/ccTable_clean.Rd b/man/ccTable_clean.Rd new file mode 100644 index 0000000..8065ff0 --- /dev/null +++ b/man/ccTable_clean.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ccTable.R +\name{ccTable_clean} +\alias{ccTable_clean} +\title{Apply all the filters} +\description{ +All the filters in configuration will be applied to create the +clean dataset. The filters include range, categories, missingness, +no_data. +} +\examples{ +\dontrun{ +tb <- create_cctable(ccd, conf, 1) +tb$clean() +} +} + diff --git a/man/ccTable_create_cctable.Rd b/man/ccTable_create_cctable.Rd new file mode 100644 index 0000000..cb8b672 --- /dev/null +++ b/man/ccTable_create_cctable.Rd @@ -0,0 +1,10 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ccTable.R +\name{ccTable_create_cctable} +\alias{ccTable_create_cctable} +\title{Create a ccTable object} +\description{ +This is a member function of ccTable-class. Using create_cctable is a safer and +easier way to create the ccTable. See create_cctable. +} + diff --git a/man/ccTable_export_csv.Rd b/man/ccTable_export_csv.Rd new file mode 100644 index 0000000..98d3097 --- /dev/null +++ b/man/ccTable_export_csv.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ccTable.R +\name{ccTable_export_csv} +\alias{ccTable_export_csv} +\title{Export the clean table as a CSV file} +\arguments{ +\item{file}{the full path of the output CSV file.} +} +\description{ +Export tclean as a CSV file. +} + diff --git a/man/ccTable_filter_categories.Rd b/man/ccTable_filter_categories.Rd new file mode 100644 index 0000000..d03e94c --- /dev/null +++ b/man/ccTable_filter_categories.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter.categorical.R +\name{ccTable_filter_categories} +\alias{ccTable_filter_categories} +\title{Categorical data filter} +\description{ +Categorical variables only allow a set of values to appear in the variable +. Due to various reasons, a categorical variable may contain values that are not +standard. The allowed values can be set in the YAML configuration while initialising +the ccTable (see ccTable-class, create_cctable). +In the following example, we can see how to set up the categorical filter +for the variable dead_icu (NIHR_HIC_ICU_0097) which only allows its value to +be A, D, E. +} +\examples{ +\dontrun{ +# Example for categorical filter setup in the YAML configuration +NIHR_HIC_ICU_0097: + category: + levels: + A: Alive + D: Dead + E: Alive - not discharged + apply: drop_entry + +# Run the filter on ccTable ct +ct$filter_categories() # run the filter +ct$apply_filters() # apply the filter and create the clean table +} +} + diff --git a/man/ccTable_filter_missingness.Rd b/man/ccTable_filter_missingness.Rd new file mode 100644 index 0000000..de547b7 --- /dev/null +++ b/man/ccTable_filter_missingness.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter.missingness.R +\name{ccTable_filter_missingness} +\alias{ccTable_filter_missingness} +\title{Data missing filter} +\arguments{ +\item{recount}{logical value. Recount the missingness if TRUE.} +} +\description{ +Deal with data when insufficient data points are supported. There are +two key items to be set in the YAML configuration file. +1) labels -- time interval. 2) accept_2d -- the accept present ratio. +So if we set the labels is 24, and accept_2d is 70. It means we accept +all the missing rate that is lower than 30% every 24 data points. +} + diff --git a/man/ccTable_filter_nodata.Rd b/man/ccTable_filter_nodata.Rd new file mode 100644 index 0000000..df9b9de --- /dev/null +++ b/man/ccTable_filter_nodata.Rd @@ -0,0 +1,10 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter.missingness.R +\name{ccTable_filter_nodata} +\alias{ccTable_filter_nodata} +\title{No data filter} +\description{ +Remove the episode when a particular field is not presented. +It need to be set up in the YAML configuration file. +} + diff --git a/man/ccTable_filter_range.Rd b/man/ccTable_filter_range.Rd new file mode 100644 index 0000000..10fd60c --- /dev/null +++ b/man/ccTable_filter_range.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter.range.R +\name{ccTable_filter_range} +\alias{ccTable_filter_range} +\title{Numerical range filter} +\arguments{ +\item{select}{the range label - "red", "amber", "green" +If I give "yellow to select, it means I only want the values which is +labeled as "yellow" to be in the clean table.} +} +\description{ +Range filter can only be applied on numerical fields. +For those fields which requires a range filter to be applied, +one needs to set a series ranges from the broadest to the narrowest in +the YAML configuration. We can set three levels (labels) of ranges, red, amber, and +green. It is also OK to set only one range instead of three. +The range filter will first assign a label to every data entry. +} +\details{ +The range in the YAML configuration file can be (l, h), [l, h], (l, h], [h, l) +standing for close, open and half open intervals. +} +\examples{ +\dontrun{ +# YAML example +NIHR_HIC_ICU_0108: + shortName: h_rate + dataItem: Heart rate + range: + labels: + red: (0, 300) # broader + amber: (11, 170) + green: (60, 100) # narrower + apply: drop_entry +# apply range filter on ccTable ct +ct$filter_range("yellow") +ct$apply_filters +} +} + diff --git a/man/ccTable_reload_conf.Rd b/man/ccTable_reload_conf.Rd new file mode 100644 index 0000000..fa80798 --- /dev/null +++ b/man/ccTable_reload_conf.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ccTable.R +\name{ccTable_reload_conf} +\alias{ccTable_reload_conf} +\title{Reload the YAML configuration file} +\arguments{ +\item{conf}{full path of the YAML configuration file or the parsed config list.} +} +\description{ +Note, this function will also reset all the operations and +remove the tclean. +} +\examples{ +\dontrun{ +tb$reload_conf("REF.yaml") +} +} + diff --git a/man/ccTable_reset.Rd b/man/ccTable_reset.Rd new file mode 100644 index 0000000..e37b0c4 --- /dev/null +++ b/man/ccTable_reset.Rd @@ -0,0 +1,10 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ccTable.R +\name{ccTable_reset} +\alias{ccTable_reset} +\title{Reset the ccTable} +\description{ +Restore the object to its initial status. All the filters, quality and the +cleaned table will be removed. +} + diff --git a/man/ccd_demographic_spell.Rd b/man/ccd_demographic_spell.Rd new file mode 100644 index 0000000..07ddb80 --- /dev/null +++ b/man/ccd_demographic_spell.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/demographics.R +\name{ccd_demographic_spell} +\alias{ccd_demographic_spell} +\title{Create demographic table with spell IDs} +\usage{ +ccd_demographic_spell(rec, duration = 2) +} +\arguments{ +\item{rec}{ccRecord} + +\item{duration}{the maximum hours of transition period} +} +\value{ +data.table demographic table with spell ID in column spell +} +\description{ +same output like ccd_demographic_table but in +addition with a spell ID. +} + diff --git a/man/ccd_demographic_table.Rd b/man/ccd_demographic_table.Rd new file mode 100644 index 0000000..108a5d1 --- /dev/null +++ b/man/ccd_demographic_table.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/demographics.R +\name{ccd_demographic_table} +\alias{ccd_demographic_table} +\title{Create the demographic tables, which includes all non-time-varying variables.} +\usage{ +ccd_demographic_table(record, dtype = TRUE) +} +\arguments{ +\item{record}{ccRecord-class} + +\item{dtype}{logical column will be type aware, else all in character.} +} +\description{ +The data type of each column is in its corresponding data +type. +} + diff --git a/man/selectTable.Rd b/man/ccd_select_table.Rd similarity index 64% rename from man/selectTable.Rd rename to man/ccd_select_table.Rd index 7231a63..de8eac0 100644 --- a/man/selectTable.Rd +++ b/man/ccd_select_table.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/selectTable.R -\name{selectTable} -\alias{selectTable} -\title{Create wide table from ccRecord} +% Please edit documentation in R/ccTable.R +\name{ccd_select_table} +\alias{ccd_select_table} +\title{Create the table for ccTable from ccRecord} \usage{ -selectTable(record, items_opt = NULL, items_obg = NULL, freq, +ccd_select_table(record, items_opt = NULL, items_obg = NULL, freq, return_list = FALSE) } \arguments{ @@ -13,7 +13,7 @@ selectTable(record, items_opt = NULL, items_obg = NULL, freq, \item{items_opt}{character vectors. Items (HIC code) selected in item_opt are optional items, which will be automatically filled when item is missing.} -\item{items_obg}{obligatory items that is obligatory; Any episode that doesn't contain +\item{items_obg}{obligatory items that is obligatory; Any episode that does not contain item in this vector will be removed.} \item{freq}{numeric cadence in hour.} @@ -24,6 +24,6 @@ item in this vector will be removed.} data.table } \description{ -Create wide table from ccRecord +Create the table for ccTable from ccRecord } diff --git a/man/unique_spell.Rd b/man/ccd_unique_spell.Rd similarity index 64% rename from man/unique_spell.Rd rename to man/ccd_unique_spell.Rd index 6dfd9f5..9cff4b1 100644 --- a/man/unique_spell.Rd +++ b/man/ccd_unique_spell.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/unique.spell.R -\name{unique_spell} -\alias{unique_spell} +% Please edit documentation in R/demographics.R +\name{ccd_unique_spell} +\alias{ccd_unique_spell} \title{find the unique spell ID.} \usage{ -unique_spell(rec, duration = 2) +ccd_unique_spell(rec, duration = 2) } \arguments{ \item{rec}{ccRecord-class} diff --git a/man/create.cctable.Rd b/man/create.cctable.Rd deleted file mode 100644 index 8d35f1b..0000000 --- a/man/create.cctable.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ccTable.R -\name{create.cctable} -\alias{create.cctable} -\title{construct function of ccTable object} -\usage{ -create.cctable(rec, freq, conf = NULL) -} -\arguments{ -\item{rec}{ccRecord} - -\item{freq}{the data cadence in hour.} - -\item{conf}{either the path of YAML configuration file or the configuration -structure in list.} -} -\value{ -ccTable object -} -\description{ -construct function of ccTable object -} - diff --git a/man/create2dclean.Rd b/man/create2dclean.Rd index 27b7448..cf33900 100644 --- a/man/create2dclean.Rd +++ b/man/create2dclean.Rd @@ -1,8 +1,8 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/create2dclean.R +% Please edit documentation in R/ccTable.R \name{create2dclean} \alias{create2dclean} -\title{Create a 2D wide clean table - low memory} +\title{Clean table - low memory} \usage{ create2dclean(record, config, freq = 1, nchunks = 1) } diff --git a/man/create_cctable.Rd b/man/create_cctable.Rd new file mode 100644 index 0000000..f6d12c8 --- /dev/null +++ b/man/create_cctable.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ccTable.R +\name{create_cctable} +\alias{create_cctable} +\title{Create a ccTable object} +\usage{ +create_cctable(rec, conf = NULL, freq = 1) +} +\arguments{ +\item{rec}{ccRecord} + +\item{conf}{either the path of YAML configuration file or the configuration} + +\item{freq}{a unique sampling frequency (in hours) for all variables. e.g. if freq is set to +1, each row in ccTable will represent a record of one hour.} +} +\value{ +ccTable +} +\description{ +Re-arrange the ccRecord object to table format where each column stands +for a variable and each row a record data point. The number of rows will +depend on the sampling frequency set in this function. If the original data +has a higher recording frequency than the set frequency (freq), the closest +data point will be taken. It is suggested the `freq` should not be set +lower than the maximum recording frequency in the original dataset. +} + diff --git a/man/data.checklist.Rd b/man/data.checklist.Rd index e2a8ce5..b728ecd 100644 --- a/man/data.checklist.Rd +++ b/man/data.checklist.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utilities.R +% Please edit documentation in R/data.R \docType{data} \name{data.checklist} \alias{data.checklist} diff --git a/man/data.quality.report.Rd b/man/data.quality.report.Rd index 224e47e..26c91c1 100644 --- a/man/data.quality.report.Rd +++ b/man/data.quality.report.Rd @@ -4,7 +4,7 @@ \alias{data.quality.report} \title{Create the data quality report} \usage{ -data.quality.report(ccd, site = NULL, file = NULL, pdf = T, +data.quality.report(ccd, site = NULL, file = NULL, pdf = TRUE, out = "report") } \arguments{ @@ -12,7 +12,7 @@ data.quality.report(ccd, site = NULL, file = NULL, pdf = T, \item{site}{a vector of the site ids for the site specified report.} -\item{file}{charcter a list of XML file origins.} +\item{file}{character a list of XML file origins.} \item{pdf}{logical create the pdf version of the DQ report, otherwise stay in markdown format} diff --git a/man/data.quality.report.brc.Rd b/man/data.quality.report.brc.Rd index 999f720..9a1a7a1 100644 --- a/man/data.quality.report.brc.Rd +++ b/man/data.quality.report.brc.Rd @@ -4,7 +4,7 @@ \alias{data.quality.report.brc} \title{Create the data quality report} \usage{ -data.quality.report.brc(ccd, pdf = T, brc = NULL, path = NULL) +data.quality.report.brc(ccd, pdf = TRUE, brc = NULL, path = NULL) } \arguments{ \item{ccd}{ccRecord} diff --git a/man/deltaTime.Rd b/man/deltaTime.Rd index 98a1d67..72f36eb 100644 --- a/man/deltaTime.Rd +++ b/man/deltaTime.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/deltaTime.R \name{deltaTime} \alias{deltaTime} -\title{convert calendar time data in a record to delta time comparing to the ICU +\title{Convert calendar date-time to the time difference comparing to the ICU admission time.} \usage{ deltaTime(record, pseudotime = FALSE, units = "hours", tdiff = FALSE) @@ -19,7 +19,7 @@ in the record.} \item{tdiff}{if false the delta time will be written in numeric format.} } \description{ -convert calendar time data in a record to delta time comparing to the ICU +Convert calendar date-time to the time difference comparing to the ICU admission time. } diff --git a/man/demg.distribution.Rd b/man/demg.distribution.Rd index 657aa32..8afc360 100644 --- a/man/demg.distribution.Rd +++ b/man/demg.distribution.Rd @@ -8,7 +8,7 @@ Create a plot of the distribution of numerical demographic data.} demg.distribution(demg, names) } \arguments{ -\item{demg}{ccRecord or demographic table created by sql.demographic.table()} +\item{demg}{ccRecord or demographic table created by ccd_demographic_table()} \item{names}{character vector of short names of numerical demographic data.} } diff --git a/man/demographic.data.completeness.Rd b/man/demographic.data.completeness.Rd index 9618dfd..162dd40 100644 --- a/man/demographic.data.completeness.Rd +++ b/man/demographic.data.completeness.Rd @@ -7,7 +7,7 @@ demographic.data.completeness(demg, names = NULL, return.data = FALSE) } \arguments{ -\item{demg}{data.table the demographic data table created by sql.demographic.table()} +\item{demg}{data.table the demographic data table created by ccd_demographic_table()} \item{names}{short name of selected items} diff --git a/man/demographic.patient.spell.Rd b/man/demographic.patient.spell.Rd deleted file mode 100644 index 3adcae3..0000000 --- a/man/demographic.patient.spell.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/unique.spell.R -\name{demographic.patient.spell} -\alias{demographic.patient.spell} -\title{Assign unique spell ID to the demographic table} -\usage{ -demographic.patient.spell(rec, duration = 2) -} -\arguments{ -\item{rec}{ccRecord} - -\item{duration}{the maximum hours of transition period} -} -\value{ -data.table demographic table with spell ID in column spell -} -\description{ -Assign unique spell ID to the demographic table -} - diff --git a/man/episode.graph.Rd b/man/episode.graph.Rd deleted file mode 100644 index 9f34368..0000000 --- a/man/episode.graph.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summary.R -\name{episode.graph} -\alias{episode.graph} -\title{Individual episode graph} -\usage{ -episode.graph(ccd, eid = 601, items = NULL) -} -\arguments{ -\item{ccd}{ccRecord} - -\item{eid}{character the episode index in the ccRecord} - -\item{items}{character NIHC code of longitudinal data.} -} -\description{ -Create an individual episode graph for its diagnosis, drugs and physiological -variables. Diagnosis and drugs are always included, while the user can -select other longitudinal data. -} - diff --git a/man/extractIndexTable.Rd b/man/extractIndexTable.Rd deleted file mode 100644 index 73d3390..0000000 --- a/man/extractIndexTable.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utilities.R -\name{extractIndexTable} -\alias{extractIndexTable} -\title{get indexing tables for time label, time-wise value, meta data label, and -meta data.} -\usage{ -extractIndexTable() -} -\value{ -list of vectors contains time.index, datat.index, meta.index, - datam.index -} -\description{ -get indexing tables for time label, time-wise value, meta data label, and -meta data. -} - diff --git a/man/extract_file_origin.Rd b/man/extract_file_origin.Rd index ad0cc79..87a48e4 100644 --- a/man/extract_file_origin.Rd +++ b/man/extract_file_origin.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/xml2ccdata.R +% Please edit documentation in R/cchic_xml.R \name{extract_file_origin} \alias{extract_file_origin} \title{Extract the original file name from a path and file removing diff --git a/man/extractInfo.Rd b/man/extract_info.Rd similarity index 62% rename from man/extractInfo.Rd rename to man/extract_info.Rd index 44f0f83..8e17d41 100644 --- a/man/extractInfo.Rd +++ b/man/extract_info.Rd @@ -1,16 +1,16 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/utilities.R -\name{extractInfo} -\alias{extractInfo} -\title{extract information from data.checklist} +\name{extract_info} +\alias{extract_info} +\title{Extract information from data.checklist} \usage{ -extractInfo() +extract_info() } \value{ list of time [data.frame(id, idt)], meta [data.frame(id, idmeta)], nontime [numeric], MAX_NUM_NHIC } \description{ -extract information from data.checklist +Extract information from data.checklist } diff --git a/man/getEpisodePeriod.Rd b/man/getEpisodePeriod.Rd index b28da58..3fb86ec 100644 --- a/man/getEpisodePeriod.Rd +++ b/man/getEpisodePeriod.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/reallocateTime.R +% Please edit documentation in R/ccTable.R \name{getEpisodePeriod} \alias{getEpisodePeriod} \title{Get the length of stay based on the first and the last data point.} diff --git a/man/getXmlepisode.Rd b/man/getXmlepisode.Rd index 13f7f29..aa9c9d7 100644 --- a/man/getXmlepisode.Rd +++ b/man/getXmlepisode.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/xml2ccdata.R +% Please edit documentation in R/cchic_xml.R \name{getXmlepisode} \alias{getXmlepisode} \title{get the episode data from xml} diff --git a/man/getfilter.Rd b/man/getfilter.Rd deleted file mode 100644 index 2e99965..0000000 --- a/man/getfilter.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ccTable.R -\name{getfilter} -\alias{getfilter} -\title{get the dfilter} -\usage{ -getfilter(dq, criterion) -} -\arguments{ -\item{dq}{can be either dqaulity table or torigin} - -\item{criterion}{should be a function to give T/F values of each entry.} -} -\description{ -get the dfilter -} - diff --git a/man/icnarc.Rd b/man/icnarc_table.Rd similarity index 76% rename from man/icnarc.Rd rename to man/icnarc_table.Rd index 314b945..f96c3d1 100644 --- a/man/icnarc.Rd +++ b/man/icnarc_table.Rd @@ -1,8 +1,8 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utilities.R +% Please edit documentation in R/data.R \docType{data} -\name{icnarc} -\alias{icnarc} +\name{icnarc_table} +\alias{icnarc_table} \title{ICNARC diagnosis reference table} \description{ ICNARC diagnosis reference table diff --git a/man/inrange.Rd b/man/inrange.Rd index ef53e4c..f5e5707 100644 --- a/man/inrange.Rd +++ b/man/inrange.Rd @@ -12,7 +12,7 @@ inrange(v, range) \item{range}{A string contains the numeric ranges in a form such as (low, up) for open range and [low, up] for close range. Multiple ranges should be separated by semi-columns which is equivalent to logical -OR. e.g. (low1, up1); (low2, up2)} +OR e.g. (low1, up1); (low2, up2)} } \description{ Check if the values of a vector v is in the given ranges. diff --git a/man/lenstay.Rd b/man/lenstay.Rd index 8fe7073..936e4f5 100644 --- a/man/lenstay.Rd +++ b/man/lenstay.Rd @@ -7,7 +7,7 @@ lenstay(demg, units = "hours") } \arguments{ -\item{demg}{data.table the demograhic table which should at least contain +\item{demg}{data.table the demographic table which should at least contain column DAICU and DDICU} \item{units}{character The unit of lenstay column, by default the output will be in hours} diff --git a/man/long2stname.Rd b/man/long2stname.Rd new file mode 100644 index 0000000..8755cab --- /dev/null +++ b/man/long2stname.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/stdid.R +\name{long2stname} +\alias{long2stname} +\title{Convert long names to short names.} +\usage{ +long2stname(l) +} +\arguments{ +\item{l}{long name such as "heart rate"} +} +\value{ +short name character such as "h_rate" +} +\description{ +Convert long names to short names. +} + diff --git a/man/parse.new.xml.Rd b/man/parse.new.xml.Rd deleted file mode 100644 index b5068f1..0000000 --- a/man/parse.new.xml.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline.R -\name{parse.new.xml} -\alias{parse.new.xml} -\title{Update the RData database} -\usage{ -parse.new.xml(xml.path, mc.cores = 4, quiet = FALSE) -} -\arguments{ -\item{xml.path}{the path of the folder of which contains the XML files.} - -\item{mc.cores}{number of processors to be applied for parallelisation.} - -\item{quiet}{logical switch on/off of the progress bar.} -} -\value{ -ccRecord object -combine new data to a ccRecord -} -\description{ -Inject episode data from the newly added XML files to the RData database. -} - diff --git a/man/plot-ccEpisode-character-method.Rd b/man/plot-ccEpisode-character-method.Rd new file mode 100644 index 0000000..11d7e36 --- /dev/null +++ b/man/plot-ccEpisode-character-method.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ccRecord.R +\docType{methods} +\name{plot,ccEpisode,character-method} +\alias{plot,ccEpisode,character-method} +\title{Episode chart} +\usage{ +\S4method{plot}{ccEpisode,character}(r, v) +} +\arguments{ +\item{r}{ccEpisode-class} + +\item{v}{character} +} +\description{ +Episode chart +} + diff --git a/man/plot-ccEpisode-missing-method.Rd b/man/plot-ccEpisode-missing-method.Rd new file mode 100644 index 0000000..2e45a74 --- /dev/null +++ b/man/plot-ccEpisode-missing-method.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ccRecord.R +\docType{methods} +\name{plot,ccEpisode,missing-method} +\alias{plot,ccEpisode,missing-method} +\title{Episode chart default fields} +\usage{ +\S4method{plot}{ccEpisode,missing}(r) +} +\arguments{ +\item{r}{ccEpisode-class} +} +\description{ +Episode chart default fields +} + diff --git a/man/plot.Rd b/man/plot.Rd new file mode 100644 index 0000000..8bd6a30 --- /dev/null +++ b/man/plot.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ccRecord.R +\name{plot} +\alias{plot} +\title{Individual episode chart} +\usage{ +plot(r, v) +} +\arguments{ +\item{r}{ccEpisode-class} + +\item{v}{short name of longitudinal data. While v is not given, the chart +will only display h_rate, spo2, bilirubin, platelets, pao2_fio2, gcs_total.} +} +\value{ +a table of selected vars of an episode +} +\description{ +Create an individual episode chart for its diagnosis, drugs and physiological +variables. Diagnosis and drugs are always included, while the user can +select other longitudinal data. +} +\examples{ +\dontrun{ +plot(ccd@episodes[[1]]) # plot first episode with default variables. +plot(ccd@episodes[[1]], "h_rate") # plot first episode heart rate +} +} + diff --git a/man/plus-ccRecord-NULL-method.Rd b/man/plus-ccRecord-NULL-method.Rd index 0775e13..aae4541 100644 --- a/man/plus-ccRecord-NULL-method.Rd +++ b/man/plus-ccRecord-NULL-method.Rd @@ -3,7 +3,7 @@ \docType{methods} \name{+,ccRecord,NULL-method} \alias{+,ccRecord,NULL-method} -\title{Adding nothing to a ccRecord object.} +\title{Adding nothing to a ccRecord object and return the original ccRecord} \usage{ \S4method{+}{ccRecord,`NULL`}(e1, e2) } @@ -13,6 +13,6 @@ \item{e2}{NULL} } \description{ -Adding nothing to a ccRecord object. +Adding nothing to a ccRecord object and return the original ccRecord } diff --git a/man/plus-ccRecord-list-method.Rd b/man/plus-ccRecord-list-method.Rd index 587062f..4ec908a 100644 --- a/man/plus-ccRecord-list-method.Rd +++ b/man/plus-ccRecord-list-method.Rd @@ -3,19 +3,21 @@ \docType{methods} \name{+,ccRecord,list-method} \alias{+,ccRecord,list-method} -\title{Adding a list of ccEpisode objects to a ccRecord} +\title{Adding a list of ccEpisode to ccRecord} \usage{ \S4method{+}{ccRecord,list}(e1, e2) } \arguments{ -\item{e1}{ccRecord-class} +\item{e1}{ccRecord} -\item{e2}{A list of ccEpisode objects} +\item{e2}{a list of ccEpisode objects} } \value{ -ccRecord-class +ccRecord } \description{ -Adding a list of ccEpisode objects to a ccRecord +Adding a list of one or multiple ccEpisode objects to a +ccRecord object, the information table (infotb) will be updated automatically. +It is the more efficient way to add multiple ccEpisode objects. } diff --git a/man/reallocateTimeRecord.Rd b/man/reallocateTimeRecord.Rd index a0f3b6c..ce41eda 100644 --- a/man/reallocateTimeRecord.Rd +++ b/man/reallocateTimeRecord.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/reallocateTime.R +% Please edit documentation in R/ccTable.R \name{reallocateTimeRecord} \alias{reallocateTimeRecord} \title{Propagate a numerical delta time interval record.} diff --git a/man/sql.demographic.table.Rd b/man/sql.demographic.table.Rd deleted file mode 100644 index be5bb66..0000000 --- a/man/sql.demographic.table.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/sql_demographic.R -\name{sql.demographic.table} -\alias{sql.demographic.table} -\title{Create demographic SQL tables. The data type of each column is in its -corresponding data type.} -\usage{ -sql.demographic.table(record, dtype = TRUE) -} -\arguments{ -\item{record}{ccRecord-class} - -\item{dtype}{logical column will be type aware, else all in character.} -} -\description{ -Create demographic SQL tables. The data type of each column is in its -corresponding data type. -} - diff --git a/man/sub-ccRecord-character-method.Rd b/man/sub-ccRecord-character-method.Rd index a33c0d5..3f83736 100644 --- a/man/sub-ccRecord-character-method.Rd +++ b/man/sub-ccRecord-character-method.Rd @@ -3,7 +3,7 @@ \docType{methods} \name{[,ccRecord,character-method} \alias{[,ccRecord,character-method} -\title{Create a ccRecord subset via selected sites.} +\title{Create a ccRecord subsetting via selected sites.} \usage{ \S4method{[}{ccRecord,character}(x, i) } @@ -13,6 +13,6 @@ \item{i}{character vector which contains site_ids, e.g. c("Q70", "Q70W")} } \description{ -Create a ccRecord subset via selected sites. +Create a ccRecord subsetting via selected sites. } diff --git a/man/sub-sub-ccRecord-method.Rd b/man/sub-sub-ccRecord-method.Rd index 905e303..43ad7a3 100644 --- a/man/sub-sub-ccRecord-method.Rd +++ b/man/sub-sub-ccRecord-method.Rd @@ -3,7 +3,7 @@ \docType{methods} \name{[[,ccRecord-method} \alias{[[,ccRecord-method} -\title{Subseting a ccRecord object and return a list of ccEpisode objects.} +\title{Subsetting a ccRecord object and return a list of ccEpisode objects.} \usage{ \S4method{[[}{ccRecord}(x, i) } @@ -13,6 +13,6 @@ \item{i}{integer vector} } \description{ -Subseting a ccRecord object and return a list of ccEpisode objects. +Subsetting a ccRecord object and return a list of ccEpisode objects. } diff --git a/man/subset-ccRecord-character-method.Rd b/man/subset-ccRecord-character-method.Rd new file mode 100644 index 0000000..14ac423 --- /dev/null +++ b/man/subset-ccRecord-character-method.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ccRecord.R +\docType{methods} +\name{subset,ccRecord,character-method} +\alias{subset,ccRecord,character-method} +\title{Get a subset of episodes that have the same from ccRecord .} +\usage{ +\S4method{subset}{ccRecord,character}(r, f) +} +\arguments{ +\item{r}{ccRecord-class} + +\item{f}{character a vector of XML file names - see ccRecord: parse_file} +} +\value{ +ccRecord object +} +\description{ +Get a subset of episodes that have the same from ccRecord . +} + diff --git a/man/subset.Rd b/man/subset.Rd new file mode 100644 index 0000000..ac065f0 --- /dev/null +++ b/man/subset.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ccRecord.R +\name{subset} +\alias{subset} +\title{Get a subset of episodes from ccRecord.} +\usage{ +subset(r, f) +} +\arguments{ +\item{r}{ccRecord-class} + +\item{f}{character a vector of XML file names - see ccRecord: parse_file} +} +\value{ +ccRecord-class +} +\description{ +Get a subset of episodes from ccRecord. +} + diff --git a/man/table1.Rd b/man/table1.Rd index d12a49e..af08878 100644 --- a/man/table1.Rd +++ b/man/table1.Rd @@ -7,7 +7,7 @@ table1(demg, names, return.data = FALSE) } \arguments{ -\item{demg}{demographic table created by sql.demographic.table()} +\item{demg}{demographic table created by ccd_demographic_table()} \item{names}{character string. Short names of data items, e.g. h_rate.} diff --git a/man/update_database.Rd b/man/update_database.Rd deleted file mode 100644 index a243885..0000000 --- a/man/update_database.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pipeline.R -\name{update_database} -\alias{update_database} -\title{Update the critical care database (RData)} -\usage{ -update_database(xml.path, restart = FALSE, splitxml = FALSE, mc.cores = 4, - quiet = FALSE) -} -\arguments{ -\item{xml.path}{character the path of the folder of which contains the XML files.} - -\item{restart}{logical purge the previous database and restart parsing for all the XML files presented.} - -\item{splitxml}{logical break down the XML files into chuncks. (Do it when the file is too big)} - -\item{mc.cores}{integer number of processors to be applied for parallelisation.} - -\item{quiet}{logical show the progress bar if true} -} -\description{ -Parse critical care data from XML files and inject them into the RData -database. -} - diff --git a/man/xml2Data.Rd b/man/xml2Data.Rd index 85fa300..c6511a1 100644 --- a/man/xml2Data.Rd +++ b/man/xml2Data.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/xml2ccdata.R +% Please edit documentation in R/cchic_xml.R \name{xml2Data} \alias{xml2Data} \title{Convert the XML file to ccRecord} diff --git a/man/xmlLoad.Rd b/man/xmlLoad.Rd index 76ffe07..a3b1e7a 100644 --- a/man/xmlLoad.Rd +++ b/man/xmlLoad.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/xml2ccdata.R +% Please edit documentation in R/cchic_xml.R \name{xmlLoad} \alias{xmlLoad} \title{load xml clinical data} diff --git a/paper.md b/paper.md index 792cc4f..d1900e0 100644 --- a/paper.md +++ b/paper.md @@ -41,10 +41,10 @@ bibliography: paper.bib --- # Summary -cleanEHR works with the Critical Care Health Informatics Collaborative database, -which collects and gathers high resolution longitudinal patient record from -critical care units at Cambridge, Guys/Kings/St Thomas', Imperial, Oxford, UCL -Hospitals. +cleanEHR is a data cleaning and wrangling platform which works with the +Critical Care Health Informatics Collaborative database. It collects and +gathers high resolution longitudinal patient record from critical care units at +Cambridge, Guys/Kings/St Thomas', Imperial, Oxford, UCL Hospitals. The purpose of cleanEHR is to enable researchers to answer clinical questions that are important to patients. cleanEHR is created to address various data diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index b4ad13f..ff917c0 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -7,7 +7,7 @@ using namespace Rcpp; // reallocateTime_ DataFrame reallocateTime_(DataFrame d, const float t_discharge, const float frequency); -RcppExport SEXP cleanEHR_reallocateTime_(SEXP dSEXP, SEXP t_dischargeSEXP, SEXP frequencySEXP) { +RcppExport SEXP _cleanEHR_reallocateTime_(SEXP dSEXP, SEXP t_dischargeSEXP, SEXP frequencySEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -18,3 +18,13 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } + +static const R_CallMethodDef CallEntries[] = { + {"_cleanEHR_reallocateTime_", (DL_FUNC) &_cleanEHR_reallocateTime_, 3}, + {NULL, NULL, 0} +}; + +RcppExport void R_init_cleanEHR(DllInfo *dll) { + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +} diff --git a/tests/data/ANALYSIS_REF.yaml b/tests/data/ANALYSIS_REF.yaml index 2d15ff3..6c64f2b 100644 --- a/tests/data/ANALYSIS_REF.yaml +++ b/tests/data/ANALYSIS_REF.yaml @@ -33,15 +33,12 @@ NIHR_HIC_ICU_0073: shortName: nhs_id dataItem: NHS number # - [ ] NOTE(2016-07-17): these are automatically imported and will cause duplicate problems if re-imported -# NIHR_HIC_ICU_0002: -# shortName: site_id -# dataItem: ICNARC CMP number -# distribution: nominal -# nodata: -# apply: drop_episode -# NIHR_HIC_ICU_0005: -# shortName: episode_id -# dataItem: Critical care local identifier / ICNARC admission number +NIHR_HIC_ICU_0002: + shortName: site_id + dataItem: ICNARC CMP number + distribution: nominal + nodata: + apply: drop_episode NIHR_HIC_ICU_0108: shortName: hrate dataItem: Heart rate diff --git a/tests/data/sample_ccd.RData b/tests/data/sample_ccd.RData new file mode 100644 index 0000000..3698f5a Binary files /dev/null and b/tests/data/sample_ccd.RData differ diff --git a/tests/testthat/test_0_load_xml.r b/tests/testthat/test_0_load_xml.r index 9df94bc..1eb5c02 100644 --- a/tests/testthat/test_0_load_xml.r +++ b/tests/testthat/test_0_load_xml.r @@ -5,8 +5,9 @@ #ccd_ <<- ccRecord() #ccd_ <<- ccd_ + ccd[2, 1] + ccd[3, 1] + load("../data/sample_ccd.RData") + tb <<- create_cctable(ccd, "../data/ANALYSIS_REF.yaml", 1) + rm("ccd") ccd <<- xml2Data("../data/test_data_anonym.xml") ccdt <<- deltaTime(ccd, pseudotime=TRUE) - - #} diff --git a/tests/testthat/test_ccdatatable.r b/tests/testthat/test_ccdatatable.r index 3448f59..79e2fe1 100644 --- a/tests/testthat/test_ccdatatable.r +++ b/tests/testthat/test_ccdatatable.r @@ -17,28 +17,28 @@ test.record <- function(v, item) { test_that("test create table",{ - ccd_delta <- suppressWarnings(deltaTime(ccd, pseudotime=T)) + ccd_delta <- suppressWarnings(deltaTime(ccd, pseudotime=TRUE)) conf <- yaml.load_file('../data/test_2yml.yml') tb <- ccTable(record=ccd_delta, conf=conf) - tb$create.table(freq=1) + tb$create_table(freq=1) # assign table to both origin and clean table expect_true(!is.null(tb$torigin)) expect_equivalent(tb$torigin, tb$tclean) }) -test_that("test get.missingness", { +test_that("test get_missingness", { cr <- ccRecord()+new.episode(list(NIHR_HIC_ICU_0108=data.frame(time=as.numeric(seq(100)), item2d=as.character(rep(10,100))))) tb <- ccTable(record=cr, conf=yaml.load_file('../data/test_2yml.yml')) - tb$create.table(freq=1) + tb$create_table(freq=1) tb$conf[[1]][['missingness']][['labels']][['yellow']] <- 1 - tb$get.missingness() + tb$get_missingness() expect_equal(tb$dquality$missingness$NIHR_HIC_ICU_0108.yellow, 100/101*100) tb$conf[[1]][['missingness']][['labels']][['yellow']] <- 0.1 - tb$get.missingness() + tb$get_missingness() expect_equal(tb$dquality$missingness$NIHR_HIC_ICU_0108.yellow, 100/1001*100) @@ -46,7 +46,7 @@ test_that("test get.missingness", { # check the case when there is no missingness table tb$dquality$missingness <- data.table(NULL) tb$tclean <- data.table(NULL) - tb$filter.missingness() + tb$filter_missingness() expect_true(any(class(tb$tclean)=="data.table")) expect_equivalent(tclean, tb$tclean) }) @@ -75,9 +75,9 @@ test_that("test filter missingness", missingness_run <- function(v, item="NIHR_HIC_ICU_0108") { cr <- test.record(v, item) tb <- ccTable(record=cr, conf=conf) - tb$create.table(freq=1) - tb$filter.missingness() - tb$apply.filters() + tb$create_table(freq=1) + tb$filter_missingness() + tb$apply_filters() expect_true(any(class(tb$tclean) == "data.table")) return(tb) } @@ -121,7 +121,7 @@ test_that("test imputation", imputation_run <- function(v, item="NIHR_HIC_ICU_0108") { cr <- test.record(v, item) tb <- ccTable(record=cr, conf=conf) - tb$create.table(freq=1) + tb$create_table(freq=1) tb$imputation() return(tb) } @@ -178,11 +178,11 @@ test_that("test imputation", #test_that("test apply filter", #{ # tb <- env$tb -# tb$filter.ranges() -# tb$filter.category() -# tb$filter.missingness() +# tb$filter_range() +# tb$filter_categories() +# tb$filter_missingness() # tb$filter.nodata() -# tb$apply.filters() +# tb$apply_filters() # tt <<- tb #}) # diff --git a/tests/testthat/test_deltaTime.r b/tests/testthat/test_deltaTime.r index 54e5d9f..602ea45 100644 --- a/tests/testthat/test_deltaTime.r +++ b/tests/testthat/test_deltaTime.r @@ -42,7 +42,7 @@ test_that("with pseudotime flag, derive the admission time from 2d data", { e1 <- list() e1[["heart_rate"]] <- data.frame(time="2014-02-01T17:00:00", item2d="80") rc <- ccRecord() + new.episode(e1) - rcd <- deltaTime(rc, pseudotime=T) + rcd <- deltaTime(rc, pseudotime=TRUE) expect_equal(rcd@nepisodes, 1) expect_equal(rcd@episodes[[1]]@data[["heart_rate"]]$time, 0) }) @@ -51,6 +51,6 @@ test_that("episode has no time data", { e1 <- list() e1[["item_id"]] <- "xxxx" rc <- ccRecord() + new.episode(e1) - expect_warning(rcd <- deltaTime(rc, pseudotime=T)) + expect_warning(rcd <- deltaTime(rc, pseudotime=TRUE)) expect_equal(rcd@nepisodes, 0) }) diff --git a/tests/testthat/test_demographic.r b/tests/testthat/test_demographic.r deleted file mode 100644 index 6250739..0000000 --- a/tests/testthat/test_demographic.r +++ /dev/null @@ -1,19 +0,0 @@ -context("Testing demographic table") - -test_that("create demographic table from ccdata and expect the equivalent - results with or without data type awareness.", { - demg <- sql.demographic.table(ccd, dtype=F) - expect_equal(nrow(demg), ccd@nepisodes) - expect_match(class(demg)[1], "data.table") - - convert.back.to.char <- function(v) { - v <- as.character(v) - v[v=="NA"] <- "NULL" - v[is.na(v)] <- "NULL" - v - } - - demg_t <- suppressWarnings(sql.demographic.table(ccd)) - for (i in seq(ncol(demg) - 1)) - expect_equivalent(demg[[i]], convert.back.to.char(demg_t[[i]])) -}) diff --git a/tests/testthat/test_demographics.r b/tests/testthat/test_demographics.r index ecda9aa..9f5262d 100644 --- a/tests/testthat/test_demographics.r +++ b/tests/testthat/test_demographics.r @@ -1,3 +1,25 @@ +context("Testing demographic table") + +test_that("create demographic table from ccdata and expect the equivalent + results with or without data type awareness.", { + demg <- ccd_demographic_table(ccd, dtype=FALSE) + expect_equal(nrow(demg), ccd@nepisodes) + expect_match(class(demg)[1], "data.table") + + convert.back.to.char <- function(v) { + v <- as.character(v) + v[v=="NA"] <- "NULL" + v[is.na(v)] <- "NULL" + v + } + + demg_t <- suppressWarnings(ccd_demographic_table(ccd)) + for (i in seq(ncol(demg) - 1)) + expect_equivalent(demg[[i]], convert.back.to.char(demg_t[[i]])) +}) + + + context("Testing functionalities for the demographic table") test_that("calculate length of stay in the ICU",{ diff --git a/tests/testthat/test_pipeline.r b/tests/testthat/test_pipeline.r deleted file mode 100644 index 94bfa8f..0000000 --- a/tests/testthat/test_pipeline.r +++ /dev/null @@ -1,105 +0,0 @@ -context("data parsing pipeline") - -test.setup <- function() { - for (i in c(".temp", ".temp/XML")) { - unlink(i, recursive=T) - dir.create(i) - } -} - -test.teardown <- function() { - unlink(".temp", recursive=T) -} - - -test_that("Discover parsed XML files and new files", { - test.setup() - f <- find.new.xml.file(".temp/XML") - expect_true(dir.exists(".temp/XML/.database")) - expect_length(f, 0) - - file.create(".temp/XML/test1.xml") - file.create(".temp/XML/test2.xml") - - f <- find.new.xml.file(".temp/XML") - expect_equivalent(f, c("test1.xml", "test2.xml")) - - file.create(".temp/XML/.database/test1.xml_someothersuffix.RData") - f <- find.new.xml.file(".temp/XML") - expect_equivalent(f, c("test2.xml")) - - # adding xml files with the same name pattern should not initiating the - # parsing process. - file.create(".temp/XML/test1.xml_1.partxml") - file.create(".temp/XML/test1.xml_2.partxml") - f <- find.new.xml.file(".temp/XML") - expect_equivalent(f, "test2.xml") - - # test if .partxml suffix can pass - file.create(".temp/XML/test2.xml_2.partxml") - file.create(".temp/XML/test2.xml_1.partxml") - file.create(".temp/XML/test2.xml_3.partxml") - f <- find.new.xml.file(".temp/XML") - - # should pass with XML as suffix - file.create(".temp/XML/file.XML") - find.new.xml.file(".temp/XML") - - file.create(".temp/XML/not_end_with_xmlsuffix") - expect_error(find.new.xml.file(".temp/XML")) - file.remove(".temp/XML/not_end_with_xmlsuffix") - - test.teardown() -}) - - -#test_that("update the new XML files", { -# test.setup() -# system("cp ../data/test_data_anonym.xml .temp/XML/test1.xml") -# system("cp ../data/test_data_anonym.xml .temp/XML/test2.xml") -# new.db <- update.new.xml(".temp/XML", quiet=T) -# expect_is(new.db, "ccRecord") -# -# xml1 <- xml2Data(".temp/XML/test1.xml") -# xml2 <- xml2Data(".temp/XML/test2.xml") -# -# expect_equal(new.db@nepisodes, xml1@nepisodes + xml2@nepisodes) -# expect_true("test1.xml.RData" %in% dir(".temp/XML/.database")) -# expect_true("test2.xml.RData" %in% dir(".temp/XML/.database")) -#}) - -#test_that("update the database", { -# alld <- update_database(xml.path=".temp/XML", restart=T) -# expect_true("alldata.RData" %in% dir(".temp/XML/.database")) -# expect_equal(alld@nepisodes, 4) -# -# alld <- update_database(xml.path=".temp/XML", restart=F) -# expect_equal(alld@nepisodes, 4) -# -# load(".temp/XML/.database/alldata.RData") -# expect_equal(alldata@nepisodes, 4) -# -#}) -# -#test_that("break down the large XML files", { -# if (Sys.info()[['sysname']] != "Darwin") { -# partxml.dir <-".temp/XML/.partxml" -# -# unlink("partxml.dir", recursive=T) -# expect_true(!file.exists("partxml.dir")) -# -# unlink(".temp/XML/.database/", recursive=T) -# break.down.xml(".temp/XML") -# expect_true(file.exists(partxml.dir)) -# expect_equal(length(dir(partxml.dir)), 2) -# -# file.create(paste(partxml.dir, "files_to_be_delete", sep="/")) -# break.down.xml(".temp/XML") -# expect_true(file.exists(partxml.dir)) -# expect_equal(length(dir(partxml.dir)), 2) -# } -#}) - -test_that("Tear down the test", { - test.teardown() -}) diff --git a/tests/testthat/test_range_filter.r b/tests/testthat/test_range_filter.r index 09673f3..5c741db 100644 --- a/tests/testthat/test_range_filter.r +++ b/tests/testthat/test_range_filter.r @@ -14,3 +14,15 @@ test_that("from string expression to range functions", expect_error(inrange(1, "(10, 1)")) }) + + +test_that("range filters", +{ + tb$filter_range() + tb$filter_range() +}) + + +test_that("all filters", { + tb$clean() +}) diff --git a/tests/testthat/test_record2.r b/tests/testthat/test_record2.r index 3e88318..a1e0948 100644 --- a/tests/testthat/test_record2.r +++ b/tests/testthat/test_record2.r @@ -43,7 +43,7 @@ test_that("testing unique spell", { NIHR_HIC_ICU_0411="2000-01-03", NIHR_HIC_ICU_0412="2000-01-30")) - expect_equivalent(unique_spell(ccd)$spell, c(1, 1)) + expect_equivalent(ccd_unique_spell(ccd)$spell, c(1, 1)) ccd <- ccRecord() + new.episode(list(NIHR_HIC_ICU_0073="NHS1", NIHR_HIC_ICU_0411="2000-01-01", @@ -52,7 +52,7 @@ test_that("testing unique spell", { NIHR_HIC_ICU_0411="2000-01-04", NIHR_HIC_ICU_0412="2000-01-30")) - expect_equivalent(unique_spell(ccd)$spell, c(1, 1)) + expect_equivalent(ccd_unique_spell(ccd)$spell, c(1, 1)) # The second admission time earlier than the discharge time by mistake. @@ -63,6 +63,6 @@ test_that("testing unique spell", { NIHR_HIC_ICU_0411="2000-01-01", NIHR_HIC_ICU_0412="2000-01-30")) - expect_equivalent(unique_spell(ccd)$spell, c(1, 1)) + expect_equivalent(ccd_unique_spell(ccd)$spell, c(1, 1)) }) diff --git a/tests/testthat/test_report.r b/tests/testthat/test_report.r index 42c0f31..e088ee7 100644 --- a/tests/testthat/test_report.r +++ b/tests/testthat/test_report.r @@ -7,16 +7,16 @@ test_that("file level summary",{ test_that("table1", { - demg <- suppressWarnings(sql.demographic.table(ccd)) - table1(demg, "SEX", return.data=T) - expect_error(table1(demg, "non_nhic", return.data=T)) - expect_error(table1(demg, "h_rate", return.data=T)) # need to be categorical data + demg <- suppressWarnings(ccd_demographic_table(ccd)) + table1(demg, "SEX", return.data=TRUE) + expect_error(table1(demg, "non_nhic", return.data=TRUE)) + expect_error(table1(demg, "h_rate", return.data=TRUE)) # need to be categorical data }) test_that("demographic data completeness", { - demg <- suppressWarnings(sql.demographic.table(ccd)) - tb <- demographic.data.completeness(demg, return.data=T) + demg <- suppressWarnings(ccd_demographic_table(ccd)) + tb <- demographic.data.completeness(demg, return.data=TRUE) ndemg <- length(which(sapply(cleanEHR:::ITEM_REF, function(x) x$Classification1=="Demographic"))) expect_equal(nrow(tb), ndemg) @@ -24,14 +24,14 @@ test_that("demographic data completeness", { test_that("calculate total data point", { - expect_equal(total.data.point(ccd), 16) + expect_equal(total.data.point(ccd), 25) }) -test_that("episode graph", { +test_that("ccd_episode_graph", { }) test_that("calculate 2D sample rate", { - tb <- create.cctable(ccdt, conf=list(NIHR_HIC_ICU_0108=list()), freq=1) + tb <- create_cctable(ccdt, conf=list(NIHR_HIC_ICU_0108=list()), freq=1) capture.output(samplerate2d(tb$torigin)) }) diff --git a/tests/testthat/test_selectTable.r b/tests/testthat/test_selectTable.r index 9d0e5dd..fa2e885 100644 --- a/tests/testthat/test_selectTable.r +++ b/tests/testthat/test_selectTable.r @@ -1,6 +1,6 @@ context("Testing table selection") -pseudoepisode <- function(n, with.meta=F) { +pseudoepisode <- function(n, with.meta=FALSE) { ep <- list(NIHR_HIC_ICU_0108=data.frame(time=as.numeric(seq(0, 19)), item2d=rep(80, 20)), NIHR_HIC_ICU_0002="site_i", @@ -10,7 +10,7 @@ pseudoepisode <- function(n, with.meta=F) { ep[["NIHR_HIC_ICU_0441"]] <- data.frame(time=as.numeric(seq(0, 9)), item2d=seq(10), meta=rep("I", 10), - stringsAsFactors=F) + stringsAsFactors=FALSE) cr <- ccRecord() for (i in seq(n)) cr <- cr + new.episode(ep) @@ -27,15 +27,15 @@ test_that("",{ test_that("",{ - selectTable(ccdt, items_opt="NIHR_HIC_ICU_0108", freq=1) + ccd_select_table(ccdt, items_opt="NIHR_HIC_ICU_0108", freq=1) }) test_that("test if drugs meta data can be re-generated when the drug itself is not presented.", { # using meropenem 0441 as an example. - cr <- pseudoepisode(1, with.meta=F) - tb <- selectTable(cr, items_opt="NIHR_HIC_ICU_0441", freq=1) + cr <- pseudoepisode(1, with.meta=FALSE) + tb <- ccd_select_table(cr, items_opt="NIHR_HIC_ICU_0441", freq=1) expect_true("NIHR_HIC_ICU_0441" %in% names(tb)) expect_true("NIHR_HIC_ICU_0441.meta" %in% names(tb)) expect_equivalent(tb$NIHR_HIC_ICU_0441, rep(as.numeric(NA), 20)) @@ -45,8 +45,8 @@ test_that("test if drugs meta data can be re-generated when the drug itself test_that("when drug data is presented, to see whether it can be correctly loaded", { - cr <- pseudoepisode(1, with.meta=T) - tb <- selectTable(cr, items_opt="NIHR_HIC_ICU_0441", freq=1) + cr <- pseudoepisode(1, with.meta=TRUE) + tb <- ccd_select_table(cr, items_opt="NIHR_HIC_ICU_0441", freq=1) expect_true("NIHR_HIC_ICU_0441" %in% names(tb)) expect_true("NIHR_HIC_ICU_0441.meta" %in% names(tb)) expect_equivalent(tb$NIHR_HIC_ICU_0441, c(seq(10), rep(NA, 10))) diff --git a/tests/testthat/test_utilities.r b/tests/testthat/test_utilities.r index 55e28c5..3bd729f 100644 --- a/tests/testthat/test_utilities.r +++ b/tests/testthat/test_utilities.r @@ -26,8 +26,8 @@ test_that("ICNARC Conversion",{ }) test_that("extractIndex table", { - expect_true(class(extractIndexTable()) == "list") - expect_true(all(unique(unlist(extractIndexTable())) %in% + expect_true(class(extract_index_table()) == "list") + expect_true(all(unique(unlist(extract_index_table())) %in% c("item1d", "time","item2d", "meta"))) }) diff --git a/vignettes/cchic_overview.Rmd b/vignettes/cchic_overview.Rmd new file mode 100644 index 0000000..467a900 --- /dev/null +++ b/vignettes/cchic_overview.Rmd @@ -0,0 +1,97 @@ +--- +title: "Introduction to CCHIC critical care data" +author: David Perez Suarez & Sinan Shi +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteEngine{knitr::rmarkdown} + %\VignetteIndexEntry{Tour} + \usepackage[utf8]{inputenc} +--- + +CCHIC data strongly impacts the design of cleanEHR. Complex, heterogeneous, and +high resolution longitudinal data is a trend of EHR analysis, which takes +advantage of the ever more sophisticated statistical techniques and growing +computation capability. CCHIC is a representative example database of such +kind. It records 263 fields including 154 time-varying fields of patients +during their stay in intensive care units across five NHS trusts in England. +The recorded variables include patient demographics, time of admission and +discharge, survival status, diagnosis, physiology, laboratory, nursing and +drug. The latest database contains 22,628 admissions from 2014 to 2016, with +about 119 million data points (~6k per patient). + +The anonymised data subset can be obtained [here](https://form.jotformeu.com/drstevok/cchic-end-user-license---cleanEHR). +The selected number of researchers can get the access to the identifiable data +[UCL IDHS](http://www.ucl.ac.uk/isd/itforslms/services/handling-sens-data/tech-soln). + +### Episode and ICU stay timeline +We introduced the concept of 'episode' as the fundamental entity of EHRs, which +comprises all the data being recorded during the ICU stay. Each episode also +contains the demographic information of the patient, ward transferring origin +and destination within a hospital and diagnosis information. It allows us to +link episode data from a single patient across the entire multi-centre database. +The key dates and times are recorded as follow. + +```{r, message=FALSE, warning=FALSE} +library(cleanEHR) +data.path <- paste0(find.package("cleanEHR"), "/doc/sample_ccd.RData") +load(data.path) + +# Extract all non-longitudinal data (demographic, time, survival status, diagnosis) +dt <- ccd_demographic_table(ccd, dtype=TRUE) +``` +`ccd_demographic_table` function returns a table of all non-time-varying fields alongside with +several derived fields -- the fields that are not directly recorded in the original data. +Each row of the table is a unique admission, and every column is a non-time-varying data field. + +* `pid`: unique patient ID derived from NHS number or PAS number. +* `AGE`: date of birth - unit admission time +* `spell`: see *Spell* + +```{r} +print(dt[1:3, ]) +``` + +Data missing are caused by many reasons. We have to understand that in such a large database, +the data quality varies. In the anonymised dataset, data can be missing due to security reason. +Missing data are either NULL or "NA" depending on the field data type. + +### Demographic data +Every patient in England has a unique NHS number and PAS (Patient Admission +System) number. These can be used to identify a unique patient. Other +demographic information can also be found in the CCHIC dataset such as age, sex, +GP code, postcode and so on. Most of the demographic data will be removed, +pseudonimised, or modified in the anonymised dataset to protect the patient +privacy. + + +### Spell +Ward transferring within or beyond ICUs are counted as different episodes respectively. +In some research, one may be interested in looking at the sickness development and the treatment +history beyond each ICU stays. A spell includes number of episodes from a +unique patient which occured in a user defined period One can link episodes by +spell ID. +```{r} +head(ccd_unique_spell(ccd, duration=1)[, c("episode_id", "spell")]) +``` + +### Diagnosis data +Instead of using free text, we adopted [ICNARC diagnosis](https://www.icnarc.org/Our-Audit/Audits/Cmp/Resources/Icm-Icnarc-Coding-Method) +code system to record the diagnosis. The full ICNARC code is a five digit number separated +with dots. From left to right each digit represents a higher category of diagnosis. Due +to the privacy concerns, in the anonysmised dataset, last two digits will be removed. +One may use the function `icnarc2diagnosis` to look up the diagnosis code. +```{r} +icnarc2diagnosis("1.1") +icnarc2diagnosis("1.1.4") +icnarc2diagnosis("1.1.4.27.1") +``` + +### Longitudinal data +CCHIC measures 154 longitudinal data. The full list of longitudinal data are shown below. + + +We can also easily plot the data from a single selected admission. +```{r, fig.width=10, fig.height=11, out.width='700px', results='hide', message=FALSE, warning=FALSE} +plot(ccd@episodes[[7]], c("h_rate", "bilirubin", "fluid_balance_d")) +``` diff --git a/vignettes/data_clean.Rmd b/vignettes/data_clean.Rmd new file mode 100644 index 0000000..837687d --- /dev/null +++ b/vignettes/data_clean.Rmd @@ -0,0 +1,430 @@ +--- +title: "Data cleaning and wrangling with cleanEHR" +author: David Perez Suarez & Sinan Shi +date: "`r Sys.Date()`" +output: + rmarkdown::html_vignette: + toc: true + toc_depth: 3 +vignette: > + %\VignetteEngine{knitr::rmarkdown} + %\VignetteIndexEntry{data cleaning and wrangling with cleanEHR} + \usepackage[utf8]{inputenc} + +--- + + +# Preparation +### Load data +A sample RData can be found in `/doc/sample_ccd.RData`. +```{r} +library(cleanEHR) +data.path <- paste0(find.package("cleanEHR"), "/doc/sample_ccd.RData") +print(data.path) +load(data.path) +``` + + +### Inspect individual episode +There are 263 fields which covers patient demographics, physiology, laboratory, +and medication information. Each field has 2 labels, NHIC code and short name. +There is a function `lookup.items()` to look up the fields you need. +`lookup.items()` function is case insensitive and allows fuzzy search. +``` +# searching for heart rate +lookup.items('heart') # fuzzy search + ++-------------------+--------------+--------------+--------+-------------+ +| NHIC.Code | Short.Name | Long.Name | Unit | Data.type | ++===================+==============+==============+========+=============+ +| NIHR_HIC_ICU_0108 | h_rate | Heart rate | bpm | numeric | ++-------------------+--------------+--------------+--------+-------------+ +| NIHR_HIC_ICU_0109 | h_rhythm | Heart rhythm | N/A | list | ++-------------------+--------------+--------------+--------+-------------+ + +``` + + +# Non-longitudinal Data +`ccd_demographic_table()` can generate a `data.table` that contains all the +non-longitudinal variables. A demonstration of how to do some work on a subset +of data. +```{r, fig.width=10, fig.height=6, out.width='600px', results='hide', message=FALSE, warning=FALSE} +# contains all the 1D fields i.e. non-longitudinal +tb1 <- ccd_demographic_table(ccd) + +# filter out all dead patient. (All patients are dead in the dataset.) +tb1 <- tb1[DIS=="D"] + +# subset variables we want (ARSD = Advanced respiratory support days, +# apache_prob = APACHE II probability) +tb <- tb1[, c("SEX", "ARSD", "apache_prob"), with=FALSE] +tb <- tb[!is.na(apache_prob)] + +# plot +library(ggplot2) +ggplot(tb, aes(x=apache_prob, y=ARSD, color=SEX)) + geom_point() + +``` + + +# Longitudinal data +## Longitudinal table structure: `ccTable` + +To deal with longitudinal data, we need to first to transform it into a table format. +cleanEHR provides a refclass `ccTable`. There are several key components in the `ccTable` +structure. + +* `torigin`: the `ccRecord` dataset will be converted into a table format, where each row is a data point and each column is a field and pivoted by `time`, `site`, and `eipisode_id`. +* `tclean`: Same structure like the `torigin` but the values are modified with the cleaning process. +* filters: `filter_range`, `filter_categories`, `filter_nodata`, `filter_missingness`. +* imputation: filling missing data. + +### Create a `cctable` +First we need to prepare a simple configuration file. The first level item is +CCHIC code, see `lookup.items()`. We suggest users to write the short name and +long name (dataItem) to avoid confusion, though the both names will not be +taken into account in the process. We selected three items, heart rate +(longitudinal), Systolic arterial blood pressure (longitudinal), and sex +(non-longitudinal). +``` +NIHR_HIC_ICU_0108: + shortName: hrate +NIHR_HIC_ICU_0112: + shortName: bp_sys_a + dataItem: Systolic Arterial blood pressure - Art BPSystolic Arterial blood pressure +NIHR_HIC_ICU_0093: + shortName: sex +``` + +```{r, echo=FALSE} +# To prepare a YAML configuration file like this. You write the following text +# in a YAML file. +conf <- " +NIHR_HIC_ICU_0108: + shortName: hrate +NIHR_HIC_ICU_0112: + shortName: bp_sys_a + dataItem: Systolic Arterial blood pressure - Art BPSystolic Arterial blood pressure +NIHR_HIC_ICU_0093: + shortName: sex +" +library(yaml) +conf <- yaml.load(conf) +``` + +```{r} +# conf is the full path of the YAML configuration. +tb <- create_cctable(ccd, conf, freq=1) +print(tb$torigin) # the table +``` +In this table we can find the following columns, + +* time: number of hours from the unit admission. Since we set the `freq`=1, the cadence between rows is always 1 hour. +* site, episode_id: combine these two columns will give you a unique admission. +* fields: three selected fields. +* extra fields: depending on the variable we choose, some extra information are given. + + +### Get the mean heart rate of each patient. +```{r} +tb$tclean[, mean(NIHR_HIC_ICU_0108, na.rm=TRUE), by=c("site", "episode_id")] +``` + +## Data cleaning with `ccTable` +### Numerical range filter +The numerical range filter can only be applied on variables. +We envisaged three different cases for the numerical ranges -- values that are impossible, e.g. +negative heart rate; possible but unlikely, e.g. heart rate of 200; within a normal range. The +filter will label all these scenarios using "red", "amber", "green" respectively. The definition +of these ranges can be configured by users based on their judgement and the purpose of research. +Note, from "red" to "green", the next range must be a subset of the previous range. + +In the following section, we would like to apply a range filter to heart rate by modifying the previous +YAML configuration file. +```yaml +NIHR_HIC_ICU_0108: + shortName: h_rate + dataItem: Heart rate + range: + labels: + red: (0, 300) # The following ranges are for demonstration purpose only. + amber: (11, 150] # open/close interval + green: (50, 100] + apply: drop_entry +NIHR_HIC_ICU_0112: + shortName: bp_sys_a + dataItem: Systolic Arterial blood pressure - Art BPSystolic Arterial blood pressure +NIHR_HIC_ICU_0093: + shortName: sex +``` + +```{r, echo=FALSE} +conf <- "NIHR_HIC_ICU_0108: + shortName: h_rate + dataItem: Heart rate + range: + labels: + red: (0, 300) + amber: (11, 150] + green: (50, 100] + apply: drop_entry +NIHR_HIC_ICU_0112: + shortName: bp_sys_a + dataItem: Systolic Arterial blood pressure - Art BPSystolic Arterial blood pressure +NIHR_HIC_ICU_0093: + shortName: sex + category: + M: male + F: female + m: male + f: female +" +conf <- yaml.load(conf) +``` + +```{r} +tb <- create_cctable(ccd, conf, freq=1) +tb$filter_range("amber") # chose only the entry with amber +tb$apply_filters() # apply the filter to the clean table +``` +Now let's see the effect on the cleaned data `tclean` + +```{r, fig.width=12, fig.height=12, out.width='700px', results='hide', message=FALSE, warning=FALSE} +cptb <- rbind(cbind(tb$torigin, data="origin"), + cbind(tb$tclean, data="clean")) + +ggplot(cptb, aes(x=time, y=NIHR_HIC_ICU_0108, color=data)) + + geom_point(size=1.5) + facet_wrap(~episode_id, scales="free_x") +``` + +In the case of changing the filter range from amber to green, +```{r} +#tb$reset() # reset the all the filters first. +tb$filter_range("green") +tb$apply_filters() +``` +```{r, fig.width=12, fig.height=12, out.width='700px', results='hide', message=FALSE, warning=FALSE} +cptb <- rbind(cbind(tb$torigin, data="origin"), + cbind(tb$tclean, data="clean")) + +ggplot(cptb, aes(x=time, y=NIHR_HIC_ICU_0108, color=data)) + + geom_point(size=1.5) + facet_wrap(~episode_id, scales="free_x") +``` + + +### Categorical data filter +The purpose of categorical data filter is to remove the unexpected categorical data. +We can extend the previous configuration file as such, +``` +NIHR_HIC_ICU_0108: + shortName: h_rate + dataItem: Heart rate +NIHR_HIC_ICU_0112: + shortName: bp_sys_a + dataItem: Systolic Arterial blood pressure - Art BPSystolic Arterial blood pressure +NIHR_HIC_ICU_0093: + shortName: sex + category: + levels: + M: male + F: female + m: male + f: female + apply: drop_entry +``` +```{r, echo=FALSE} +conf <- "NIHR_HIC_ICU_0108: + shortName: h_rate + dataItem: Heart rate +NIHR_HIC_ICU_0112: + shortName: bp_sys_a + dataItem: Systolic Arterial blood pressure - Art BPSystolic Arterial blood pressure +NIHR_HIC_ICU_0093: + shortName: sex + category: + levels: + M: male + F: female + m: male + f: female + apply: drop_entry +" +conf <- yaml.load(conf) + +# Try to modify the original data +tb$torigin$NIHR_HIC_ICU_0093[1] <- "ERROR" +``` + +```{r} +tb$reload_conf(conf) # change configuration file +tb$filter_categories() +tb$apply_filters() +``` +There is one error gender introduced in the sex field. After the filtering +process, the error entry is substitute by NA. +```{r} +unique(tb$torigin$NIHR_HIC_ICU_0093) +unique(tb$tclean$NIHR_HIC_ICU_0093) +``` + + +### Missingness filter +In some cases, we wish to exclude episodes where the data is too scarce. There are +three components in the missingness filter. In the following example, we arbitrarily +name the filter "daily". We gave 24 hours interval and 70% accepting rate. It is to say +in any 24 hours interval, if the heart rate missing rate is higher than 30%, we will +exclude the entire episode. Note, the unit `labels: daily: 24` number of rows instead of +hours. It represents 24 hours because the cadence of the `ccTable` is 1 hour. + +``` +NIHR_HIC_ICU_0108: + shortName: h_rate + dataItem: Heart rate + missingness: + labels: + daily: 24 + accept_2d: + daily: 70 + apply: drop_episode +NIHR_HIC_ICU_0112: + shortName: bp_sys_a + dataItem: Systolic Arterial blood pressure - Art BPSystolic Arterial blood pressure +NIHR_HIC_ICU_0093: + shortName: sex +``` + +```{r, echo=FALSE} +conf <- "NIHR_HIC_ICU_0108: + shortName: h_rate + dataItem: Heart rate + missingness: + labels: + daily: 24 + accept_2d: + daily: 70 + apply: drop_episode +NIHR_HIC_ICU_0112: + shortName: bp_sys_a + dataItem: Systolic Arterial blood pressure - Art BPSystolic Arterial blood pressure +NIHR_HIC_ICU_0093: + shortName: sex +" +conf <- yaml.load(conf) +``` + +```{r} +tb$reload_conf(conf) # change configuration file +tb$filter_missingness() +tb$apply_filters() +``` + +```{r} +# episodes in the original data table +unique(paste(tb$torigin$site, tb$torigin$episode_id)) +# episodes in the cleaned data table +unique(paste(tb$tclean$site, tb$tclean$episode_id)) +``` + +### Nodata filter +Similarly, we can setup the no data filter as following. Here it means, +drop the entire episode if no hear rate data is found. + +``` +NIHR_HIC_ICU_0108: + shortName: h_rate + dataItem: Heart rate + no_data: + apply: drop_episode +NIHR_HIC_ICU_0112: + shortName: bp_sys_a + dataItem: Systolic Arterial blood pressure - Art BPSystolic Arterial blood pressure +NIHR_HIC_ICU_0093: + shortName: sex +``` + +### Run all filters together +To wrap up, we can put all the above stated filter configurations together in the +YAML file and run the filter together. +``` +NIHR_HIC_ICU_0108: + shortName: h_rate + dataItem: Heart rate + range: + labels: + red: (0, 300) + amber: (11, 150] + green: (50, 100] + apply: drop_entry + missingness: + labels: + daily: 24 + accept_2d: + daily: 70 + apply: drop_episode + nodata: + apply: drop_episode +NIHR_HIC_ICU_0112: + shortName: bp_sys_a + dataItem: Systolic Arterial blood pressure - Art BPSystolic Arterial blood pressure +NIHR_HIC_ICU_0093: + shortName: sex + category: + levels: + M: male + F: female + m: male + f: female + apply: drop_entry +``` + +```{r, echo=FALSE} +conf <- "NIHR_HIC_ICU_0108: + shortName: h_rate + dataItem: Heart rate + range: + labels: + red: (0, 300) + amber: (11, 150] + green: (50, 100] + apply: drop_entry + missingness: + labels: + daily: 24 + accept_2d: + daily: 70 + apply: drop_episode + nodata: + apply: drop_episode +NIHR_HIC_ICU_0112: + shortName: bp_sys_a + dataItem: Systolic Arterial blood pressure - Art BPSystolic Arterial blood pressure +NIHR_HIC_ICU_0093: + shortName: sex + category: + levels: + M: male + F: female + m: male + f: female + apply: drop_entry +" +conf <- yaml.load(conf) +``` + + +```{r} +# Method 1 +tb <- create_cctable(ccd, conf, freq=1) +tb$filter_range("amber") +tb$filter_missingness() +tb$filter_nodata() +tb$filter_categories() +tb$apply_filters() + +tb$reset() # reset + +# Method 2 +#tb$clean() +``` + diff --git a/vignettes/graphs/cchic_timeline.png b/vignettes/graphs/cchic_timeline.png new file mode 100644 index 0000000..38e4bc2 Binary files /dev/null and b/vignettes/graphs/cchic_timeline.png differ diff --git a/vignettes/graphs/item_ref.png b/vignettes/graphs/item_ref.png new file mode 100644 index 0000000..8a4ac19 Binary files /dev/null and b/vignettes/graphs/item_ref.png differ diff --git a/vignettes/tour.Rmd b/vignettes/tour.Rmd deleted file mode 100644 index f1e8eef..0000000 --- a/vignettes/tour.Rmd +++ /dev/null @@ -1,156 +0,0 @@ ---- -title: "A brief tour of cleanEHR" -author: David Perez Suarez & Sinan Shi -date: "`r Sys.Date()`" -output: rmarkdown::html_vignette -vignette: > - %\VignetteEngine{knitr::rmarkdown} - %\VignetteIndexEntry{Tour} - \usepackage[utf8]{inputenc} ---- - - -### Load data -Usually a RData file which stores all the dataset will be given. A sample RData -can be found in `/doc/sample_ccd.RData`. -```{r} -library(cleanEHR) -data.path <- paste0(find.package("cleanEHR"), "/doc/sample_ccd.RData") -load(data.path) -``` -### Data overview -You can have a quick overview of the data by checking `infotb`. In the sample -dataset, sensitive variables such as NHS number and admission time have been -removed or twisted. -```{r} -print(head(ccd@infotb)) -``` -The basic entry of the data is episode which indicates an admission of a site. -Using `episode_id` and `site_id` can locate a unique admission entry. `pid` is -a unique patient identifier. - -```{r} -# quickly check how many episodes are there in the dataset. -ccd@nepisodes -``` - -There are 263 fields which covers patient demographics, physiology, laboratory, -and medication information. Each field has 2 labels, NHIC code and short name. -There is a function `lookup.items()` to look up the fields you need. -`lookup.items()` function is case insensitive and allows fuzzy search. -``` -# searching for heart rate -lookup.items('heart') # fuzzy search - -+-------------------+--------------+--------------+--------+-------------+ -| NHIC.Code | Short.Name | Long.Name | Unit | Data.type | -+===================+==============+==============+========+=============+ -| NIHR_HIC_ICU_0108 | h_rate | Heart rate | bpm | numeric | -+-------------------+--------------+--------------+--------+-------------+ -| NIHR_HIC_ICU_0109 | h_rhythm | Heart rhythm | N/A | list | -+-------------------+--------------+--------------+--------+-------------+ - -``` - -### Inspect individual episode -```{r, fig.width=10, fig.height=11, out.width='700px', results='hide', message=FALSE, warning=FALSE} -# check the heart rate, bilirubin, fluid balance, and drugs of episode_id = 7. -# NOTE: due to anonymisation reason, some episodes data cannot be displayed -# properly. -episode.graph(ccd, 7, c("h_rate", "bilirubin", "fluid_balance_d")) -``` - -## Non-longitudinal Data -`sql.demographic.table()` can generate a `data.table` that contains all the -non-longitudinal variables. A demonstration of how to do some work on a subset -of data. -```{r, fig.width=10, fig.height=6, out.width='700px', results='hide', message=FALSE, warning=FALSE} -# contains all the 1D fields i.e. non-longitudinal -tb1 <- sql.demographic.table(ccd) - -# filter out all dead patient. (All patients are dead in the dataset.) -tb1 <- tb1[DIS=="D"] - -# subset variables we want (ARSD = Advanced respiratory support days, -# apache_prob = APACHE II probability) -tb <- tb1[, c("SEX", "ARSD", "apache_prob"), with=F] -tb <- tb[!is.na(apache_prob)] - -# plot -library(ggplot2) -ggplot(tb, aes(x=apache_prob, y=ARSD, color=SEX)) + geom_point() - -``` - -## Longitudinal data -To deal with longitudinal data, we need to first to transform it into a long -table format. - -### Create a `cctable` -```{r} -# To prepare a YAML configuration file like this. You write the following text -# in a YAML file. -conf <- " -NIHR_HIC_ICU_0108: - shortName: hrate -NIHR_HIC_ICU_0112: - shortName: bp_sys_a - dataItem: Systolic Arterial blood pressure - Art BPSystolic Arterial blood pressure -NIHR_HIC_ICU_0093: - shortName: sex -" -library(yaml) -tb <- create.cctable(ccd, yaml.load(conf), freq=1) - -# a lazy way to do that. -tb <- create.cctable(ccd, list(NIHR_HIC_ICU_0108=list(), - NIHR_HIC_ICU_0112=list(), - NIHR_HIC_ICU_0093=list()), - freq=1) -print(tb$tclean) -``` - -### Manipulate on `cctable` -* Get the mean heart rate of each patient. -```{r} -tb$tclean[, mean(NIHR_HIC_ICU_0108, na.rm=T), by=c("site", "episode_id")] -``` - - -### Data cleaning -To clean the data, one needs to write the specification in the YAML -configuration file. - -```{r, fig.width=12, fig.height=12, out.width='700px', results='hide', message=FALSE, warning=FALSE} -conf <-" -NIHR_HIC_ICU_0108: - shortName: hrate - dataItem: Heart rate - distribution: normal - decimal_places: 0 - range: - labels: - red: (0, 300) - amber: (11, 150) - apply: drop_entry - missingness: # remove episode if missingness is higher than 70% in any 24 hours interval - labels: - yellow: 24 - accept_2d: - yellow: 70 - apply: drop_episode -" - -ctb <- create.cctable(ccd, yaml.load(conf), freq=1) -ctb$filter.ranges("amber") # apply range filters -ctb$filter.missingness() -ctb$apply.filters() - -cptb <- rbind(cbind(ctb$torigin, data="origin"), - cbind(ctb$tclean, data="clean")) - - -ggplot(cptb, aes(x=time, y=NIHR_HIC_ICU_0108, color=data)) + - geom_point(size=1.5) + facet_wrap(~episode_id, scales="free_x") - -```