merge with master

ropensci-archive · Jun 30, 2016 · 515050d · 515050d
2 parents 1cc6ada + 962832d
commit 515050d
Show file tree

Hide file tree

Showing 10 changed files with 174 additions and 43 deletions.
diff --git a/R/range.filter.R b/R/range.filter.R
@@ -49,7 +49,7 @@ ccTable$methods(
     filter.ranges = function(select='red') {
         rgnum <- list('red'=1, 'amber'=2, 'green'=3)
         inselectrange <- function(x, ...) {
-            x > rgnum[[select]]
+            x >= rgnum[[select]]
         }
 
         if(is.null(.self$dquality$range) || 

diff --git a/R/xml2ccdata.r b/R/xml2ccdata.r
@@ -15,18 +15,32 @@ getXmlepisode <- function(xml.root, id) {
     xml.root[[1]][[2]][[id]]
 }
 
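+#' Extract the original file name from a file path, dropping everything
+#' that follows the first match of `removestr` in the file name.
+#' @param pathfile path to a file whose name may carry extra suffixes
+#' @param removestr extension of the original file; it is passed to
+#'   `strsplit` as a regular expression, so the "." in the default
+#'   matches any character
+#' @return string: the file name up to the first match, followed by `removestr`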
+#' @export extract_file_origin
+extract_file_origin <- function(pathfile, removestr='.xml'){
+  split_path <- unlist(strsplit(pathfile, "/"))
+  filename <- split_path[length(split_path)]
+  original <- unlist(strsplit(filename, removestr))
+  return(paste(original[1], removestr, sep=""))
+  }
+
+
 #' convert xml data to ccdata format
 #' @param file xml file name
 #' @return ccdata 
 #' @export xml2Data
 xml2Data <- function (file, select.episode=NULL, quiet=TRUE, xml=NULL,
                       file_origin="NA", parse_time=Sys.time()){
-    if (is.null(xml)) {
-        split_file_name <- unlist(strsplit(file, "/"))
-        file_origin <- split_file_name[length(split_file_name)]
-
-        xml <- xmlLoad(file)
+  if (is.null(xml)) {
+    if (file_origin == "NA") {
+      file_origin <- extract_file_origin(file)
     }
+    xml <- xmlLoad(file)
+  }
 
     episode.num <- xmlSize(xml[[1]][[2]])
     if(is.null(select.episode))

diff --git a/example/clean_data.r b/example/clean_data.r
@@ -5,8 +5,7 @@ if (!exists("ccd_delta_num"))
     load('../data/delta_num.Rdata')
 ccd <- ccd_delta_num
 
-dt.sofa <- ccTable(conf=yaml.load_file('tests/data/test_sofa.yml'), record=ccd)
-# create table with all selected items in yaml conf with cadance of 1 hour.
+dt.sofa <- ccTable(conf=yaml.load_file('tests/data/ANALYSIS_REF.yaml'), record=ccd)
 dt.sofa$create.table(freq=1)
 dt.sofa$filter.ranges()
 dt.sofa$filter.category()

diff --git a/example/pipeline/break_into.sh b/example/pipeline/break_into.sh
@@ -7,36 +7,68 @@ then
     exit 1
 fi
 
+default_ext='partxml'
 
-# change the end subject for something different - <tataa>
-sed -e 's|</d:subject>|<tataaa>|' $1 > $1.tmp
+# <d:xx> or <xx> file?
+dchar="d:"
+dchar_exist=$(head -2 $1 | grep -c "<${dchar}")
+if [ ${dchar_exist} -eq 0 ]
+then
+    dchar=""
+fi
+
+subject="${dchar}subject"
+
+# replace the closing </${subject}> tag with a placeholder - <cut_here> -
+# so it is not counted by the awk below.
+sed -e 's|</'"${subject}"'>|<cut_here>\n|' ${1} > ${1}.tmp
 
-# Break the file into chuncks where <d:subject> occurs.
-awk '/\<d:subject\>/ { delim++ } {file = sprintf("chunks_%s.txt", int(delim/'$2')); print >> file; }' $1.tmp
+# Break the file into chunks where <${subject}> occurs.
+# Each time <${subject}> is found, delim increases by one; every line is
+#   appended to the file numbered int(delim / maxpatients), where
+#   maxpatients is the 2nd argument, so a new file starts every
+#   maxpatients subjects.
+# delim is initialised to -1 so that the first file also holds the
+# number of subjects asked for.
+awk 'BEGIN {delim=-1} \
+         /\<'"${subject}"'\>/ { delim++ } \
+                  {file = sprintf("'${1}'_%s.'${default_ext}'", int(delim/'${2}'));\
+                   print >> file; } \
+     END { print "'${1}' has ", delim+1, "subjects"}' ${1}.tmp
 
-firstline=$(head -n1 chunks_0.txt)
-lastline="</d:data></d:context></d:document>"
-nfiles=$(expr `ls chunks_* | wc -l` - 1)
+
+# Extract the header of the file, including its metadata:
+# - take everything up to the first <subject> (the tag used to split the file)
+# - remove that <subject> tag so it is not repeated when the header is inserted
+# - remove all non-printing characters - one of them seems to make the
+#   insertion fail afterwards.
+# head won't work because in some files the header runs over multiple lines
+firstlines=$(sed -n '1,/<'"${subject}"'>/p' ${1}_0.${default_ext} | \
+                    sed 's/<'"${subject}"'>//' | tr -dc '[:print:]')
+
+lastline="</${dchar}data></${dchar}context></${dchar}document>"
+nfiles=$(ls "${1}"_* | wc -l)
 
 # loop over all the files to add header and footer for each file that needs it
-for i in $(seq 0 ${nfiles})
+for ((i=0; i<${nfiles}; i++))
 do
-    echo chunks_${i}
+    output=${1}_${i}.${default_ext}
+    # replace the label changed before
+    sed -i 's|<cut_here>|</'"${subject}"'>|' ${output}
 
-    sed -i 's|<tataaa>|</d:subject>|' chunks_${i}.txt
-
-    if [ $i -lt ${nfiles} ]
+    # add footer to every file except the last one
+    if [ $i -lt $((${nfiles} - 1)) ]
     then
-        echo "$lastline" >> chunks_${i}.txt
+        echo "$lastline" >> ${output}
     fi
 
-
+    # add header to every file except the first one
     if [ $i -gt 0 ]
     then
-        sed -i '1s|^|'"$firstline"'|' chunks_${i}.txt
+        sed -i '1s|^|'"${firstlines}"'|' ${output}
     fi
 
 done
 
-rm $1.tmp
+# Remove the temporary file used
+rm ${1}.tmp
 
diff --git a/example/pipeline/combine_data.r b/example/pipeline/combine_data.r
@@ -1,8 +1,10 @@
 #!/usr/bin/Rscript
+
 library(ccdata)
 
+args = commandArgs(trailingOnly=TRUE)
+
 institute <- c("CUH", "Oxford", "GSTT", "imperial", "UCLH")
-#institute <- "imperial"
 
 data<-dir(institute, full.name=TRUE)[grep(".RData", dir(institute,
                                                         full.name=TRUE))]
@@ -16,4 +18,4 @@ for (d in data) {
 
 ccd <- new
 
-save(ccd, file="all_patients.Rdata")
+save(ccd, file=args[1])
diff --git a/example/pipeline/extract_data.r b/example/pipeline/extract_data.r
@@ -3,15 +3,7 @@
 library(ccdata)
 
 args = commandArgs(trailingOnly=TRUE)
-r <- xmlLoad(args[1])
-npatient <- length(names(r[[1]][[2]]))
 
-step_iterator <- seq(1, npatient, by=100)
-steps <- c(step_iterator, npatient)
-
-
-for (i in seq(step_iterator)) {
-    cat("extract from", c(steps[i], "to", steps[i+1]-1, "\n"))
-    ccd <- xml2Data(xml=r, file_origin=args[1], select.episode=seq(steps[i], steps[i+1]-1), quiet=FALSE)
-    save(ccd, file=paste(args[1], steps[i], steps[i+1]-1, ".RData", sep="_"))
-}
+cat("extract patients from", c(args[1], "\n"))
+ccd <- xml2Data(args[1])
+save(ccd, file=paste(args[1], ".RData", sep="_"))
diff --git a/example/pipeline/pipeline.sh b/example/pipeline/pipeline.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# Pipeline code to generate the ccdata structure
+
+
+DEFAULT_SPACES='xml'
+allPatients='all_patients.RData'
+allPatients_untime='delta_num.RData'
+
+function remove_spaces (){
+    mv "${1}" "${1// /_}"
+}
+# export the function so the child bash started by find -exec can call it.
+export -f remove_spaces
+
+function remove_spaces_ext (){
+    if [ -z "$1" ]
+    then
+        echo "-- removing spaces on \"${DEFAULT_SPACES}\" files -- "
+    else
+        echo "-- removing spaces on \"${1}\" files -- "
+    fi
+
+    extension=${1-$DEFAULT_SPACES}
+
+    FILES_spaced_num=$(find ./ -type f -iname '* *.'"${extension}" | wc -l)
+    find ./ -type f -iname '*\ *.'"${extension}" -exec bash -c 'remove_spaces "{}"' \;
+    # if you try to do this as a loop read:
+    #  http://mywiki.wooledge.org/BashPitfalls#for_i_in_.24.28ls_.2A.mp3.29
+
+    echo "-- converted ${FILES_spaced_num} files --"
+}
+
+
+#============================================================
+# remove spaces from file names, otherwise the whitespace-splitting
+# awk/xargs steps below break the names apart
+#============================================================
+remove_spaces_ext
+
+#============================================================
+# Break files into smaller chunks in parallel
+#  - find the files, sort them by size (%k), largest first
+#  - extract the file names and run break_into.sh on 4 files at a time,
+#    with 3 subjects per chunk
+#  => filename.xml_xx.partxml; where xx is a non-padded number
+#============================================================
+find ./ -type f -iname '*.xml' -printf "%k %p\n" | sort -nr \
+    | awk '{print $2}' | xargs -n1 -P 4 -I % ./break_into.sh % 3
+
+#============================================================
+# Convert each portion to ccdata
+#============================================================
+find ./ -type f -iname '*.partxml' | xargs -n1 -P 4 ./extract_data.r
+
+#============================================================
+# Combine all the files
+#============================================================
+./combine_data.r ${allPatients}
+
+#============================================================
+# Anonymise data removing timestamp
+#============================================================
+./untimeit.r ${allPatients} ${allPatients_untime}
+
+echo "Files ${allPatients} and ${allPatients_untime} created."
diff --git a/example/pipeline/pipline.r b/example/pipeline/pipline.r
diff --git a/example/pipeline/untimeit.r b/example/pipeline/untimeit.r
@@ -0,0 +1,12 @@
+#!/usr/bin/Rscript
+
+library(ccdata)
+
+args = commandArgs(trailingOnly=TRUE)
+
+load(args[1])
+ccd <- reindexRecord(ccd)
+ccd <- deltaTime(ccd, anonymised=T)
+ccd <- uniquePatients(ccd)
+ccd_delta_num <- ccd
+save(ccd_delta_num, file=args[2])
diff --git a/tests/testthat/test_xml_data.r b/tests/testthat/test_xml_data.r
@@ -1,5 +1,27 @@
 context("Tests of the xml parser")
 
+test_that("check original filename",
+{
+  # with no changes
+  file_input_orig <- "and_its_file.xml"
+  file_input <- paste("/this/is/a/path/", file_input_orig, sep="")
+  expect_equal(extract_file_origin(file_input),file_input_orig)
+
+  # with a suffix
+  file_input <- "/this/is/a/path/and_its_file.xml_00.part"
+  expect_equal(extract_file_origin(file_input),file_input_orig)
+
+  # with multiple xmls
+  file_input <- "/this/is/a/path/and_its_file.xml_00.xml.part"
+  expect_equal(extract_file_origin(file_input),file_input_orig)
+
+  # extracting different extension
+  file_input <- "/this/is/a/path/and_its_file.xml.txt.part"
+  expect_equal(extract_file_origin(file_input, removestr='.txt'),
+               paste(file_input_orig, ".txt", sep=""))
+})
+
+
 test_that("load xml file", 
 {
 # ccdata ccd is loaded as a global variable.