transform script now creates loader files

added README for description
CBIIT · Feb 5, 2024 · 6bf435b · 6bf435b
1 parent c32c0ce
commit 6bf435b
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 6 deletions.
diff --git a/model-desc/load/README.md b/model-desc/load/README.md
@@ -0,0 +1,18 @@
+# Cancer Moonshot Biobank to CTDC Transformer
+
+v0.1 (05 Feb 2024)
+
+
+The file [cmb-transform-ctdc.r](./cmb-transform-ctdc.r) is an R script that uses the mapping spreadsheet [cmb-dbgap-to-ctdc-mapping.v02-05-24.xlsx](./cmb-dbgap-to-ctdc-mapping.v02-05-24.xlsx) to transform Biobank data in dbGaP submission spreadsheets to CSV (.txt) files suitable for the Bento data loader.
+
+The transformation as of v0.1 performs the following actions:
+
+* Converts source (Biobank dbGaP) file column names to target (CTDC model) Property names;
+* Copies the data values for the desired columns (given in the mapping file) to the appropriate records in the loader files;
+* Regroups the desired columns from the source data under the desired Node in the target model, by creating per-node CSVs named after the Nodes;
+* Correctly maintains the subject id and specimen id relationships with the data records;
+* Creates unique random IDs for each record in the CSVs (e.g., the values for "diagnosis\_id" and "treatment\_id").
+
+The data values themselves are not yet transformed to CTDC model values; this is a planned task that requires data mappings.
+
+
diff --git a/model-desc/load/cmb-transform-ctdc.r b/model-desc/load/cmb-transform-ctdc.r
@@ -5,7 +5,7 @@ filter <- dplyr::filter
 datadir <- "data"
 map_file  <- "cmb-dbgap-to-ctdc-mapping.v10-26-23.xlsx"
 map_sheet  <- "Sheet1"
-files <- grep("xlsx", list.files(datadir), value=T)
+files <- grep("^[^~].*xlsx", list.files(datadir), value=T)
 
 ## actual column names in dbGaP data files:
 fields_by_file <- tibble(files %>%
@@ -179,9 +179,11 @@ for (nd in maps$`CTDC Destination Node` %>% unique) {
             tb  <- ctdc_spec %>% left_join(ctdc_subj, by=c("subject.subject_id")) %>%
                       mutate( type = nd ) %>% 
                       unique
-            tb  <- tb %>% mutate(sid = if_else(is.na(specimen_id),
-                                               dum_ids(specimen_id),
-                                               specimen_id)) %>%
+            # dummy specimen_ids for NAs
+            tb  <- tb %>% mutate(sid = map_chr(specimen_id,
+                                               function (x) if_else(is.na(x),
+                                               dum_ids(x),
+                                               x))) %>%
                 select(-specimen_id) %>% rename(specimen_id = sid)
             write_tsv(tb,
                       paste(nd,"txt",sep="."), na="")                      
@@ -191,15 +193,31 @@ for (nd in maps$`CTDC Destination Node` %>% unique) {
         tb  <- ctdc_subj %>% mutate(type = nd)
         if (nd != "subject") {
             tb  <- tb %>% mutate(idname = new_ids(type))
-            names(tb)[length(tb)] <- paste(nd,"id",sep="_")
+            nd_id  <- nd
+            if (nd == "surgery") {nd_id <- "surgical_procedure"}
+            if (nd == "radiotherapy") {nd_id  <- "radiological_procedure"}
+            names(tb)[length(tb)] <- paste(nd_id,"id",sep="_")
             if (nd == "subject_status") {
                 tb  <- tb %>% filter( !(is.na(survival_status) & is.na(primary_cause_of_death)) )
             }
         }
         write_tsv(tb, paste(nd,"txt",sep="."),na="")
     }
     else if(!is.null(ctdc_spec)) {
-        if (nd != "subject") {
+        if (nd == "specimen") {
+            tb  <- ctdc_spec %>% 
+                      mutate( type = nd ) %>% 
+                      unique
+            # dummy specimen_ids for NAs
+            tb  <- tb %>% mutate(sid = map_chr(specimen_id,
+                                               function (x) if_else(is.na(x),
+                                               dum_ids(x),
+                                               x))) %>%
+                select(-specimen_id) %>% rename(specimen_id = sid)
+            write_tsv(tb,
+                      paste(nd,"txt",sep="."), na="")                      
+        }
+        else {
             tb  <- tb %>% mutate(idname = new_ids(type))
             names(tb)[length(tb)] <- paste(nd,"id",sep="_")
         }