Skip to content
This repository has been archived by the owner on Aug 23, 2022. It is now read-only.

Commit

Permalink
merge with master
Browse files Browse the repository at this point in the history
  • Loading branch information
sinanshi committed Jun 30, 2016
2 parents 1cc6ada + 962832d commit 515050d
Show file tree
Hide file tree
Showing 10 changed files with 174 additions and 43 deletions.
2 changes: 1 addition & 1 deletion R/range.filter.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ ccTable$methods(
filter.ranges = function(select='red') {
rgnum <- list('red'=1, 'amber'=2, 'green'=3)
inselectrange <- function(x, ...) {
x > rgnum[[select]]
x >= rgnum[[select]]
}

if(is.null(.self$dquality$range) ||
Expand Down
24 changes: 19 additions & 5 deletions R/xml2ccdata.r
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,32 @@ getXmlepisode <- function(xml.root, id) {
xml.root[[1]][[2]][[id]]
}

#' Extract the original file name from a path, dropping any suffix that
#' follows the original extension.
#'
#' The directory part of `pathfile` is discarded. The remaining file name is
#' cut at the first occurrence of `removestr` and `removestr` is appended
#' back, so "/a/b/file.xml_00.partxml" becomes "file.xml". If `removestr`
#' does not occur in the file name, it is simply appended to it.
#'
#' @param pathfile a single path or file name, which may carry extra
#'   suffixes after the original extension
#' @param removestr the extension that ends the original file name. It is
#'   matched literally, not as a regular expression.
#' @return string, the original file name ending in `removestr`
#' @export extract_file_origin
extract_file_origin <- function(pathfile, removestr='.xml'){
    split_path <- unlist(strsplit(pathfile, "/", fixed=TRUE))
    filename <- split_path[length(split_path)]
    # fixed=TRUE: the '.' of '.xml' must not behave as a regex wildcard,
    # otherwise a name such as "myxml_file.xml" is cut at "yxml".
    original <- unlist(strsplit(filename, removestr, fixed=TRUE))
    paste0(original[1], removestr)
}


#' convert xml data to ccdata format
#' @param file xml file name
#' @return ccdata
#' @export xml2Data
xml2Data <- function (file, select.episode=NULL, quiet=TRUE, xml=NULL,
file_origin="NA", parse_time=Sys.time()){
if (is.null(xml)) {
split_file_name <- unlist(strsplit(file, "/"))
file_origin <- split_file_name[length(split_file_name)]

xml <- xmlLoad(file)
if (is.null(xml)) {
if (file_origin == "NA") {
file_origin <- extract_file_origin(file)
}
xml <- xmlLoad(file)
}

episode.num <- xmlSize(xml[[1]][[2]])
if(is.null(select.episode))
Expand Down
3 changes: 1 addition & 2 deletions example/clean_data.r
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@ if (!exists("ccd_delta_num"))
load('../data/delta_num.Rdata')
ccd <- ccd_delta_num

dt.sofa <- ccTable(conf=yaml.load_file('tests/data/test_sofa.yml'), record=ccd)
# create table with all selected items in yaml conf with a cadence of 1 hour.
dt.sofa <- ccTable(conf=yaml.load_file('tests/data/ANALYSIS_REF.yaml'), record=ccd)
dt.sofa$create.table(freq=1)
dt.sofa$filter.ranges()
dt.sofa$filter.category()
Expand Down
64 changes: 48 additions & 16 deletions example/pipeline/break_into.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,36 +7,68 @@ then
exit 1
fi

default_ext='partxml'

# change the end subject for something different - <tataa>
sed -e 's|</d:subject>|<tataaa>|' $1 > $1.tmp
# <d:xx> or <xx> file?
dchar="d:"
dchar_exist=$(head -2 $1 | grep -c "<${dchar}")
if [ ${dchar_exist} -eq 0 ]
then
dchar=""
fi

subject="${dchar}subject"

# change the end subject for something different - <cut_here>
# so it is not counted in the awk below.
sed -e 's|</'"${subject}"'>|<cut_here>\n|' ${1} > ${1}.tmp

# Break the file into chunks where <d:subject> occurs.
awk '/\<d:subject\>/ { delim++ } {file = sprintf("chunks_%s.txt", int(delim/'$2')); print >> file; }' $1.tmp
# Break the file into chunks where <${subject}> occurs.
# Each time <subject> is found, delim will increase
# if delim/maxpatients (2nd argument) == 1 then
# create a new file.
# initialising delim as -1 so the first file includes the
# number of subjects asked.
awk 'BEGIN {delim=-1} \
/\<'"${subject}"'\>/ { delim++ } \
{file = sprintf("'${1}'_%s.'${default_ext}'", int(delim/'${2}'));\
print >> file; } \
END { print "'${1}' has ", delim+1, "subjects"}' ${1}.tmp

firstline=$(head -n1 chunks_0.txt)
lastline="</d:data></d:context></d:document>"
nfiles=$(expr `ls chunks_* | wc -l` - 1)

# extract the header of the file with its meta.
# - extract everything till the first <subject> (what's used to separate the file)
# - remove the instance for <subject> so it's not repeated when inserted.
# - remove all non-printing characters - it seems there is one that makes the
# insertion fail afterwards.
# head won't work because some files run over multiple lines
firstlines=$(sed -n '1,/<'"${subject}"'>/p' ${1}_0.${default_ext} | \
sed 's/<'"${subject}"'>//' | tr -dc '[:print:]')

lastline="</${dchar}data></${dchar}context></${dchar}document>"
nfiles=$(ls "${1}"_* | wc -l)

# loop over all the files to add header and footer for each file that needs it
for i in $(seq 0 ${nfiles})
for ((i=0; i<${nfiles}; i++))
do
echo chunks_${i}
output=${1}_${i}.${default_ext}
# replace the label changed before
sed -i 's|<cut_here>|</'"${subject}"'>|' ${output}

sed -i 's|<tataaa>|</d:subject>|' chunks_${i}.txt

if [ $i -lt ${nfiles} ]
# add footer to the files
if [ $i -lt $((${nfiles} - 1)) ]
then
echo "$lastline" >> chunks_${i}.txt
echo "$lastline" >> ${output}
fi


# add header to the files
if [ $i -gt 0 ]
then
sed -i '1s|^|'"$firstline"'|' chunks_${i}.txt
sed -i '1s|^|'"${firstlines}"'|' ${output}
fi

done

rm $1.tmp
# Remove the temporary file used
rm ${1}.tmp

6 changes: 4 additions & 2 deletions example/pipeline/combine_data.r
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
#!/usr/bin/Rscript

library(ccdata)

args = commandArgs(trailingOnly=TRUE)

institute <- c("CUH", "Oxford", "GSTT", "imperial", "UCLH")
#institute <- "imperial"

data<-dir(institute, full.name=TRUE)[grep(".RData", dir(institute,
full.name=TRUE))]
Expand All @@ -16,4 +18,4 @@ for (d in data) {

ccd <- new

save(ccd, file="all_patients.Rdata")
save(ccd, file=args[1])
14 changes: 3 additions & 11 deletions example/pipeline/extract_data.r
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,7 @@
library(ccdata)

args = commandArgs(trailingOnly=TRUE)
r <- xmlLoad(args[1])
npatient <- length(names(r[[1]][[2]]))

step_iterator <- seq(1, npatient, by=100)
steps <- c(step_iterator, npatient)


for (i in seq(step_iterator)) {
cat("extract from", c(steps[i], "to", steps[i+1]-1, "\n"))
ccd <- xml2Data(xml=r, file_origin=args[1], select.episode=seq(steps[i], steps[i+1]-1), quiet=FALSE)
save(ccd, file=paste(args[1], steps[i], steps[i+1]-1, ".RData", sep="_"))
}
cat("extract patients from", c(args[1], "\n"))
ccd <- xml2Data(args[1])
save(ccd, file=paste(args[1], ".RData", sep="_"))
64 changes: 64 additions & 0 deletions example/pipeline/pipeline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/bin/bash

# Pipeline code to generate the ccdata structure


# Default file extension that remove_spaces_ext looks for when it is called
# without an argument.
DEFAULT_SPACES='xml'
# File name handed to combine_data.r and then to untimeit.r as its first argument.
allPatients='all_patients.RData'
# File name handed to untimeit.r as its second argument.
allPatients_untime='delta_num.RData'

# Rename a single file so that its name contains no spaces.
#   $1 - path of the file to rename
# Only the file name itself is rewritten (spaces become underscores); the
# directory part is left as is, so the target directory is always the one
# the file already lives in.
function remove_spaces (){
    local src="${1}"
    local dir base
    dir=$(dirname -- "${src}")
    base=$(basename -- "${src}")
    # '--' keeps mv from reading a name that starts with '-' as an option.
    mv -- "${src}" "${dir}/${base// /_}"
}
# make the function available as a command so find can use it.
export -f remove_spaces

# Rename every file below the current directory whose name contains a space
# and ends in the given extension, using remove_spaces for each one.
#   $1 - file extension to look for (optional, defaults to DEFAULT_SPACES)
# Prints which extension is being processed and how many files matched
# before the renaming started.
function remove_spaces_ext (){
    # ':-' treats an empty argument like a missing one, so the message and
    # the search always refer to the same extension.
    local extension="${1:-$DEFAULT_SPACES}"
    local files_spaced_num

    echo "-- removing spaces on \"${extension}\" files -- "

    files_spaced_num=$(find ./ -type f -iname '*\ *.'"${extension}" | wc -l)
    # The file name is passed as a positional argument ($1) rather than being
    # pasted into the command text, so quotes or '$' in a name are not
    # interpreted by the inner shell.
    find ./ -type f -iname '*\ *.'"${extension}" \
        -exec bash -c 'remove_spaces "$1"' _ {} \;
    # if you try to do this as a loop read:
    # http://mywiki.wooledge.org/BashPitfalls#for_i_in_.24.28ls_.2A.mp3.29

    echo "-- converted ${files_spaced_num} files --"
}


#============================================================
# remove spaces from file names, otherwise xargs won't parallelise
#============================================================
remove_spaces_ext

#============================================================
# Break files into smaller chunks in parallel
#  - find the files, sort them by size (%k)
#  - extract the file names and run break_into 4 at a time,
#    passing 3 as its second argument
#  => filename.xml_xx.partxml; where xx is a non padded number
#============================================================
# -I already runs one command per input line; combining it with -n1 makes
# GNU xargs complain about mutually exclusive options, so -n1 is left out.
find ./ -type f -iname '*.xml' -printf "%k %p\n" | sort -nr \
    | awk '{print $2}' | xargs -P 4 -I % ./break_into.sh % 3

#============================================================
# Convert each portion to ccdata
#============================================================
find ./ -type f -iname '*.partxml' | xargs -n1 -P 4 ./extract_data.r

#============================================================
# Combine all the files
#============================================================
./combine_data.r "${allPatients}"

#============================================================
# Anonymise data removing timestamp
#============================================================
./untimeit.r "${allPatients}" "${allPatients_untime}"

echo "Files ${allPatients} and ${allPatients_untime} created."
6 changes: 0 additions & 6 deletions example/pipeline/pipline.r

This file was deleted.

12 changes: 12 additions & 0 deletions example/pipeline/untimeit.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/Rscript

# Usage: untimeit.r <input RData file> <output RData file>
#
# Loads the input file, passes the record through reindexRecord(),
# deltaTime(anonymised=TRUE) and uniquePatients(), and saves the result in
# the output file under the object name `ccd_delta_num`.

library(ccdata)

args <- commandArgs(trailingOnly=TRUE)
if (length(args) < 2) {
    stop("usage: untimeit.r <input RData file> <output RData file>",
         call.=FALSE)
}

# NOTE(review): the input file is expected to define an object named `ccd`;
# nothing here checks that it does.
load(args[1])
ccd <- reindexRecord(ccd)
# TRUE rather than T: T is an ordinary variable and can be reassigned.
ccd <- deltaTime(ccd, anonymised=TRUE)
ccd <- uniquePatients(ccd)
ccd_delta_num <- ccd
save(ccd_delta_num, file=args[2])
22 changes: 22 additions & 0 deletions tests/testthat/test_xml_data.r
Original file line number Diff line number Diff line change
@@ -1,5 +1,27 @@
context("Tests of the xml parser")

test_that("check original filename",
{
    expected_name <- "and_its_file.xml"

    # a plain path: the file name comes back untouched
    plain_path <- paste0("/this/is/a/path/", expected_name)
    expect_equal(extract_file_origin(plain_path), expected_name)

    # anything following the extension is dropped
    suffixed_path <- "/this/is/a/path/and_its_file.xml_00.part"
    expect_equal(extract_file_origin(suffixed_path), expected_name)

    # the extension appears twice: the name is cut at the first one
    repeated_path <- "/this/is/a/path/and_its_file.xml_00.xml.part"
    expect_equal(extract_file_origin(repeated_path), expected_name)

    # a different extension can be selected through removestr
    txt_path <- "/this/is/a/path/and_its_file.xml.txt.part"
    expected_txt <- paste0(expected_name, ".txt")
    expect_equal(extract_file_origin(txt_path, removestr='.txt'), expected_txt)
})


test_that("load xml file",
{
# ccdata ccd is loaded as a global variable.
Expand Down

0 comments on commit 515050d

Please sign in to comment.