Skip to content

Commit

Permalink
Merge branch 'initial_cmb_rebuild_of_ctdc_data_model'
Browse files Browse the repository at this point in the history
  • Loading branch information
majensen committed Feb 5, 2024
2 parents f824d1d + 6bf435b commit 1469580
Show file tree
Hide file tree
Showing 2 changed files with 135 additions and 34 deletions.
18 changes: 18 additions & 0 deletions model-desc/load/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Cancer Moonshot Biobank to CTDC Transformer

v0.1 (05 Feb 2024)


The file (cmb-transform-ctdc.r)[./cmb-transform-ctdc.r] is an R script that uses the mapping spreadsheet (cmb-dbgap-to-ctdc-mapping.v02-05-24.xlsx)[./cmb-dbgap-to-ctdc-mapping.v02-05-24.xlsx] to transform Biobank data in dbGaP submission spreadsheets to CSV (.txt) files suitable for the Bento data loader.

The transformation as of v0.1 performs the following actions:

* Converts source (Biobank dbGaP) file column names to target (CTDC model) Property names;
* Copies the data values for the desired columns (given in the mapping file) to the appropriate records in the loader files;
* Regroups the desired columns from source data under desired Node in the target model, by creating per-node CSVs, named after the Nodes;
* Correctly maintains the subject id and specimen id relationships with the data records;
* Creates unique random IDs for each record in the CSVs (e.g., the values for "diagnosis\_id", "treatment\_id" and so on.).

The data values themselves are not yet transformed to CTDC model values; this is a planned task that requires data mappings.


151 changes: 117 additions & 34 deletions model-desc/load/cmb-transform-ctdc.r
Original file line number Diff line number Diff line change
@@ -1,21 +1,58 @@
library(tidyverse)
library(readxl)


filter <- dplyr::filter
datadir <- "data"
files <- grep("xlsx", list.files(datadir), value=T)
map_file <- "cmb-dbgap-to-ctdc-mapping.v10-26-23.xlsx"
map_sheet <- "Sheet1"
files <- grep("^[^~].*xlsx", list.files(datadir), value=T)

## actual column names in dbGaP data files:
fields_by_file <- tibble(files %>%
map_dfr(
function (f) {
xx<-read_excel(paste(datadir,f,sep="/"),col_names=F);
data.frame(file = rep(f,length(xx[1,])),
field = as.vector(xx[1,],mode="character"))}))

cmb_subject <- NULL
cmb_specimen <- NULL
ctcd_data <- NULL

## Read submitted CMB data

maps <- read_excel("cmb-dbgap-to-ctdc-mapping.maj.xlsx",sheet="prop maps") %>%
filter(`dbGaP code` != '3B')
maps <- read_excel(map_file,sheet=map_sheet) %>%
filter(!is.na(`CMB Source Data File`)) %>%
filter(!is.na(`CMB Source Field`)) %>%
mutate( `dbGaP code` = str_to_upper( str_sub(`CMB Source Data File`,1,2) ) )

## Transform CMB vocabulary to CTDC model vocabulary
not_processed <- maps %>% anti_join(fields_by_file, by=c("CMB Source Data File" = "file",
"CMB Source Field" = "field"))
maps <- maps %>% semi_join(fields_by_file, by=c("CMB Source Data File" = "file",
"CMB Source Field" = "field"))
if (length(not_processed) > 0) {
warning("The above fields in the map file are not present in the data files.\nThey will not be processed.")
not_processed %>% select(`CMB Source Data File`,`CMB Source Field`) %>% print(n=Inf)
}

## make duplicated field names among input files unique (add "qualified_source_field" coln)

for (m in maps$`dbGaP file` %>% unique) {
dup_fields <- (maps %>% group_by(`CMB Source Field`) %>% summarize( ct = n() ) %>% arrange( desc(ct) ) %>% filter(ct > 1) %>% select(`CMB Source Field`))$`CMB Source Field`

maps <- maps %>%
mutate(qualified_source_field = map2_chr(`CMB Source Field`, `CMB Source Data File`,
function(x,y)
if_else(
x %in% dup_fields,
str_c(x,".", str_match(y,".*_([^_.]*)")[2]),
x)) )



## Transform CMB vocabulary to CTDC model vocabulary
cmb_subject <- NULL
cmb_specimen <- NULL
for (m in maps$`CMB Source Data File` %>% unique) {
fn <- grep(m, files, value=T, fixed=T)
if (length(fn) == 0) {
next
Expand All @@ -24,16 +61,26 @@ for (m in maps$`dbGaP file` %>% unique) {
print(fn)
if (!is.na(file.info(fn)$isdir) && !file.info(fn)$isdir ) {
dta <- read_excel(fn)
dbcode <- as.character( maps %>% dplyr::filter( `dbGaP file` == m ) %>%
# why is the below line so complicated?
dbcode <- as.character( maps %>% dplyr::filter( `CMB Source Data File` == m ) %>%
select(`dbGaP code`) %>% group_by(`dbGaP code`) %>% summarize)
if (dbcode == '5B') {
# 5B tables - dplyr::filter by desired VARNAMEs and join on SUBJECT_ID
if (dbcode == '5A') {
# 5A tables - dplyr::filter by desired VARNAMEs and join on SUBJECT_ID
dta <- dta %>%
select(
c('SUBJECT_ID',
(maps %>% dplyr::filter( `dbGaP file` == m) %>%
select(VARNAME))$VARNAME))
print(dta)
(maps %>% dplyr::filter( `CMB Source Data File` == m) %>%
select(`CMB Source Field`))$`CMB Source Field`)) %>%
rename_with(
function (x) map_chr(x, function(y) {
return(as.character(
maps %>%
filter(`CMB Source Data File` == m) %>%
filter(`CMB Source Field` == y) %>%
select(qualified_source_field)))
}),
!SUBJECT_ID
)
if (is.null(cmb_subject)) {
cmb_subject <- dta
}
Expand All @@ -42,15 +89,24 @@ for (m in maps$`dbGaP file` %>% unique) {
cmb_subject %>% full_join(dta, by = c('SUBJECT_ID'))
}
}
else if (dbcode == '6B') {
## 6B tables - dplyr::filter by desired VARNAMEs and join on SUBJECT_ID and
else if (dbcode == '6A') {
## 6A tables - dplyr::filter by desired VARNAMEs and join on SUBJECT_ID and
## SAMPLE_ID
dta <- dta %>%
select(
c('SUBJECT_ID', 'SAMPLE_ID',
(maps %>% dplyr::filter( `dbGaP file` == m) %>%
select(VARNAME))$VARNAME))
print(dta)
(maps %>% dplyr::filter( `CMB Source Data File` == m) %>%
select(`CMB Source Field`))$`CMB Source Field`)) %>%
rename_with(
function (x) map_chr(x, function(y) {
return(as.character(
maps %>%
filter(`CMB Source Data File` == m) %>%
filter(`CMB Source Field` == y) %>%
select(qualified_source_field)))
}),
!ends_with("_ID")
)
if (is.null(cmb_specimen)) {
cmb_specimen <- dta
}
Expand All @@ -76,47 +132,58 @@ dum_ids <- function (x) paste("dum",round(runif(n=length(x),min=1,max=1000000))

## Write loader-ready tab-separated value files

for (nd in maps$`CTDC node` %>% unique) {
for (nd in maps$`CTDC Destination Node` %>% unique) {
ctdc_subj <- NULL
ctdc_spec <- NULL
subj_set <- maps %>% filter( `CTDC node` == nd & `dbGaP code` == '5B' ) %>%
add_row(VARNAME="SUBJECT_ID",`CTDC node`=nd,`CTDC property`="subject.subject_id")
samp_set <- maps %>% filter( `CTDC node` == nd & `dbGaP code` == '6B' ) %>%
add_row(VARNAME="SUBJECT_ID",`CTDC node`=nd,`CTDC property`="subject.subject_id")
subj_set <- maps %>% filter( `CTDC Destination Node` == nd & `dbGaP code` == '5A' ) %>%
add_row(`CMB Source Field`="SUBJECT_ID",
qualified_source_field="SUBJECT_ID",
`CTDC Destination Node`=nd,
`CTDC Destination Property`="subject.subject_id")
samp_set <- maps %>% filter( `CTDC Destination Node` == nd & `dbGaP code` == '6A' ) %>%
add_row(`CMB Source Field`="SUBJECT_ID",
qualified_source_field="SUBJECT_ID",
`CTDC Destination Node`=nd,
`CTDC Destination Property`="subject.subject_id")
if (nrow(subj_set)>1) {
ctdc_subj <- cmb_subject %>% select( subj_set$VARNAME ) %>%
ctdc_subj <- cmb_subject %>% select( subj_set$qualified_source_field ) %>%
rename_with(
function (x) map_chr(x, function(y) {
return(as.character(
subj_set %>%
filter(VARNAME == y) %>% select(`CTDC property`)))
filter(qualified_source_field == y) %>%
select(`CTDC Destination Property`)))
})
) %>% unique
}
if (nrow(samp_set)>1) {
ctdc_spec <- cmb_specimen %>% select( samp_set$VARNAME) %>%
ctdc_spec <- cmb_specimen %>% select( samp_set$qualified_source_field) %>%
rename_with(
function (x) map_chr(x, function(y) {
return(as.character(
samp_set %>%
filter(VARNAME == y) %>% select(`CTDC property`)))
filter(qualified_source_field == y) %>%
select(`CTDC Destination Property`)))
})
) %>% unique
}
if (!is.null(ctdc_subj) & !is.null(ctdc_spec)) {
if (nd == "diagnosis") {
write_tsv(ctdc_subj %>% left_join(ctdc_spec, by=c("subject.subject_id")) %>%
mutate(type = nd) %>% mutate( diagnosis_id = new_ids(type) ) %>%
unique,
write_tsv(ctdc_subj %>%
left_join(ctdc_spec, by=c("subject.subject_id")) %>%
mutate(type = nd) %>%
mutate( diagnosis_id = new_ids(type) ) %>% unique,
paste(nd,"txt",sep="."),na="")
}
else if (nd == "specimen") {
tb <- ctdc_spec %>% left_join(ctdc_subj, by=c("subject.subject_id")) %>%
mutate( type = nd ) %>%
unique
tb <- tb %>% mutate(sid = if_else(is.na(specimen_id),
dum_ids(specimen_id),
specimen_id)) %>%
# dummy specimen_ids for NAs
tb <- tb %>% mutate(sid = map_chr(specimen_id,
function (x) if_else(is.na(x),
dum_ids(x),
x))) %>%
select(-specimen_id) %>% rename(specimen_id = sid)
write_tsv(tb,
paste(nd,"txt",sep="."), na="")
Expand All @@ -126,15 +193,31 @@ for (nd in maps$`CTDC node` %>% unique) {
tb <- ctdc_subj %>% mutate(type = nd)
if (nd != "subject") {
tb <- tb %>% mutate(idname = new_ids(type))
names(tb)[length(tb)] <- paste(nd,"id",sep="_")
nd_id <- nd
if (nd == "surgery") {nd_id <- "surgical_procedure"}
if (nd == "radiotherapy") {nd_id <- "radiological_procedure"}
names(tb)[length(tb)] <- paste(nd_id,"id",sep="_")
if (nd == "subject_status") {
tb <- tb %>% filter( !(is.na(survival_status) & is.na(primary_cause_of_death)) )
}
}
write_tsv(tb, paste(nd,"txt",sep="."),na="")
}
else if(!is.null(ctdc_spec)) {
if (nd != "subject") {
if (nd == "specimen") {
tb <- ctdc_spec %>%
mutate( type = nd ) %>%
unique
# dummy specimen_ids for NAs
tb <- tb %>% mutate(sid = map_chr(specimen_id,
function (x) if_else(is.na(x),
dum_ids(x),
x))) %>%
select(-specimen_id) %>% rename(specimen_id = sid)
write_tsv(tb,
paste(nd,"txt",sep="."), na="")
}
else {
tb <- tb %>% mutate(idname = new_ids(type))
names(tb)[length(tb)] <- paste(nd,"id",sep="_")
}
Expand Down

0 comments on commit 1469580

Please sign in to comment.