diff --git a/.gitignore b/.gitignore
index 21045d5..57c4cb3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,5 @@ markdown/pairs_possible_matches.rds
markdown/rpairs_epiwt.rds
markdown/rpairs_jar.rds
+
+NOTES.txt
diff --git a/markdown/data_medstar_epcr_02_variable_management.Rmd b/markdown/data_medstar_epcr_02_variable_management.Rmd
index 97144eb..8837e9c 100644
--- a/markdown/data_medstar_epcr_02_variable_management.Rmd
+++ b/markdown/data_medstar_epcr_02_variable_management.Rmd
@@ -1,11 +1,6 @@
---
title: "Manage Variables in MedStar EPCR Data"
date: "Created: 2018-12-26
Updated: `r Sys.Date()`"
-output:
- html_notebook:
- toc: true
- toc_float: true
- css: custom-css.css
---
# Overview
@@ -24,28 +19,21 @@ Sys.setenv(TZ = "US/Central")
```{r message=FALSE}
library(tidyverse)
-library(bfuncs)
```
medstar_epcr.feather was created in data_medstar_epcr_01_import.Rmd
-```{r}
-medstar_epcr <- feather::read_feather("/Volumes/sph_research/DETECT/one_year_data/medstar_epcr_01_import.feather")
+```{bash}
+open 'smb://uctnascifs.uthouston.edu/sph_research/DETECT' # NOTE(review): `open` on an smb:// URL looks macOS-specific; presumably it mounts the share that is read from /Volumes/DETECT below - confirm
```
```{r}
-about_data(medstar_epcr) # 35,557 observations and 32 variables
+medstar_epcr <- feather::read_feather("/Volumes/DETECT/one_year_data/medstar_epcr_01_import.feather")
```
-[top](#top)
-
-
-
-
-
-
-
-
+```{r}
+dim(medstar_epcr) # 35,557 rows (observations) and 32 columns (variables)
+```
# Standardize character strings
@@ -72,14 +60,6 @@ rm(vars)
[top](#top)
-
-
-
-
-
-
-
-
# Remove "city of" from address_city value
```{r}
@@ -90,14 +70,6 @@ medstar_epcr <- medstar_epcr %>%
[top](#top)
-
-
-
-
-
-
-
-
# Separate names, dob's, and street addresses
* Some names have three parts (e.g., Mary Jo Blake). Here, we split up full name into first name and last name. For now, we ignore middle name(s). We may need to change this later.
@@ -119,19 +91,18 @@ medstar_epcr <- medstar_epcr %>%
)
```
+Replace each whitespace character in the street name part of the address with an underscore (e.g., "main st" becomes "main_st").
+
```{r}
-about_data(medstar_epcr) # 35,557 observations and 39 variables
+medstar_epcr <- medstar_epcr %>%
+ mutate(
+ address_street_name = stringr::str_replace_all(address_street_name, "\\s", "_")
+ )
```
-[top](#top)
-
-
-
-
-
-
-
-
+```{r}
+dim(medstar_epcr) # 35,557 rows (observations) and 39 columns (variables)
+```
# Recode categories
@@ -158,9 +129,7 @@ medstar_epcr <- medstar_epcr %>%
) %>%
mutate_at(
vars(starts_with("detect")),
- funs(
- if_else(. == "N/A", NA_character_, .)
- )
+ ~ if_else(. == "N/A", NA_character_, .)
)
```
@@ -185,19 +154,9 @@ medstar_epcr <- medstar_epcr %>%
```
```{r}
-about_data(medstar_epcr) # 35,557 observations and 40 variables
+dim(medstar_epcr) # 35,557 rows (observations) and 40 columns (variables)
```
-[top](#top)
-
-
-
-
-
-
-
-
-
# Create indicator for completed DETECT screening
@@ -216,19 +175,9 @@ medstar_epcr <- medstar_epcr %>%
```
```{r}
-about_data(medstar_epcr) # 35,557 observations and 56 variables
+dim(medstar_epcr) # 35,557 rows (observations) and 56 columns (variables)
```
-[top](#top)
-
-
-
-
-
-
-
-
-
# Process numeric variables
@@ -240,19 +189,9 @@ medstar_epcr <- medstar_epcr %>%
```
```{r}
-about_data(medstar_epcr) # 35,557 observations and 56 variables
+dim(medstar_epcr) # 35,557 rows (observations) and 56 columns (variables)
```
-[top](#top)
-
-
-
-
-
-
-
-
-
# Duplicate (almost) rows
@@ -282,7 +221,7 @@ medstar_epcr <- medstar_epcr %>%
```
```{r}
-about_data(medstar_epcr) # 35,557 observations and 57 variables
+dim(medstar_epcr) # 35,557 rows (observations) and 57 columns (variables)
```
How many pairs of duplicate pcr numbers are there?
@@ -302,14 +241,14 @@ For each of those pcr numbers, if the only thing that differs between the two ro
So, for each variable of interest, create a dummy variable that indicates if if values are different within incident pcr number
```{r}
-medstar_epcr <- medstar_epcr %>%
+medstar_epcr <- medstar_epcr %>%
mutate_at(
.vars = vars(
arrival_time, response_num, incident_pcr, incident_complaint, age,
name_full, dob, address_street, address_city, address_state,
address_zip, gender, race, symptoms, crew_sig, disposition
),
- .funs = funs(diff = as.numeric(length(unique(.)) > 1))
+ .funs = list(diff = ~ as.numeric(length(unique(.)) > 1))
) %>%
ungroup()
```
@@ -325,8 +264,8 @@ medstar_epcr <- medstar_epcr %>%
```{r}
# Data checking
-# medstar_epcr %>%
-# filter(pcr_dup) %>%
+# medstar_epcr %>%
+# filter(pcr_dup) %>%
# select(incident_pcr, pcr_dup, ends_with("_diff"), aps_report, answered_count, diff_count)
```
@@ -365,7 +304,7 @@ medstar_epcr <- medstar_epcr %>%
```
```{r}
-about_data(medstar_epcr) # 35,556 observations and 58 variables
+dim(medstar_epcr) # 35,556 rows (observations) and 58 columns (variables)
```
@@ -389,7 +328,7 @@ medstar_epcr <- medstar_epcr %>%
```
```{r}
-about_data(medstar_epcr) # 35,555 observations and 58 variables
+dim(medstar_epcr) # 35,555 rows (observations) and 58 columns (variables)
```
@@ -449,39 +388,19 @@ medstar_epcr <- medstar_epcr %>%
```
```{r}
-about_data(medstar_epcr) # 28,228 observations and 56 variables
+dim(medstar_epcr) # 28,228 rows (observations) and 56 columns (variables)
```
-[top](#top)
-
-
-
-
-
-
-
-
-
# Save data
```{r}
-feather::write_feather(
+readr::write_rds(
medstar_epcr,
- "/Volumes/sph_research/Detect/one_year_data/medstar_epcr_02_variable_management.feather"
+ "/Volumes/DETECT/one_year_data/medstar_epcr_02_variable_management.rds"
)
```
-[top](#top)
-
-
-
-
-
-
-
-
-
# Session information
diff --git a/markdown/data_medstar_epcr_02_variable_management.nb.html b/markdown/data_medstar_epcr_02_variable_management.nb.html
deleted file mode 100644
index b449660..0000000
--- a/markdown/data_medstar_epcr_02_variable_management.nb.html
+++ /dev/null
@@ -1,873 +0,0 @@
-
-
-
-
-
The raw MedStar epcr data was imported in data_medstar_epcr_01_import.Rmd
In this file we prepare the data for analysis
library(tidyverse)
-library(bfuncs)
-
-
-
-medstar_epcr.feather was created in data_medstar_epcr_01_import.Rmd
- - - -medstar_epcr <- feather::read_feather("/Volumes/sph_research/DETECT/one_year_data/medstar_epcr_01_import.feather")
-
-
-
-
-
-
-about_data(medstar_epcr) # 35,557 observations and 32 variables
-
-
-[1] "35,557 observations and 32 variables"
-
-
-
-
-Because we will merge this data with other data sets in the future based on character strings (e.g., name), we need to go ahead and standardize their formats here. This will prevent mismatches during the merges. Specifically, we:
-vars <- quos(name_full, address_street, address_city, address_state)
-medstar_epcr <- medstar_epcr %>%
- mutate_at(vars(!!! vars), tolower) %>%
- mutate_at(vars(!!! vars), stringr::str_replace_all, "[^a-zA-Z\\d\\s]", " ") %>%
- mutate_at(vars(!!! vars), stringr::str_replace, "[[:blank:]]$", "") %>%
- mutate_at(vars(!!! vars), stringr::str_replace_all, "[[:blank:]]{2,}", " ")
-rm(vars)
-
-
-
-
-medstar_epcr <- medstar_epcr %>%
- mutate(address_city = stringr::str_replace(address_city, "city of ", ""))
-
-
-
-
-Some names have three parts (e.g., Mary Jo Blake). Here, we split up full name into first name and last name. For now, we ignore middle name(s). We may need to change this later.
We also separate dob into its component parts: month, day, year.
We also separate the street address into the number part and the street name part
medstar_epcr <- medstar_epcr %>%
- mutate(
- name_first = stringr::str_extract(name_full, "\\w+(?=[[:blank:]])"),
- name_last = stringr::str_extract(name_full, "\\S*$"),
- birth_mnth = lubridate::month(dob),
- birth_day = lubridate::day(dob),
- birth_year = lubridate::year(dob),
- address_num = stringr::str_extract(address_street, "^\\d{1,5}") %>% as.numeric(),
- address_street_name = stringr::str_trim(str_replace(address_street, "^\\d{1,5}", ""))
- )
-
-
-
-
-
-
-about_data(medstar_epcr) # 35,557 observations and 39 variables
-
-
-[1] "35,557 observations and 39 variables"
-
-
-
-
-For select variables, recode variants of missing data (e.g. “Unknown”) to NA
For select variables collapse categories
medstar_epcr <- medstar_epcr %>%
- mutate(
- gender = case_when(
- gender == "Unknown (Unable to Determine)" ~ NA_character_,
- gender == "Not Applicable" ~ NA_character_,
- TRUE ~ gender
- ),
- race = case_when(
- race == "Not Applicable" ~ NA_character_,
- race == "Not Recorded" ~ NA_character_,
- TRUE ~ race
- )
- ) %>%
- mutate_at(
- vars(starts_with("detect")),
- funs(
- if_else(. == "N/A", NA_character_, .)
- )
- )
-
-
-
-medstar_epcr <- medstar_epcr %>%
- mutate(
- race_8cat = case_when(
- race == "American Indian or Alaska Native" ~ "American Indian or Alaska Native - non-Hispanic",
- race == "Asian" ~ "Asian - non-Hispanic",
- race == "Black or African American" ~ "Black or African American - non-Hispanic",
- race == "Hispanic or Latino" ~ "Hispanic or Latino, Any Race",
- race == "Native Hawaiian or Other Pacific Islander" ~ "Native Hawaiian or Other Pacific Islander - non-Hispanic",
- race == "White" ~ "White - non-Hispanic",
- stringr::str_detect(race, "Hispanic") ~ "Hispanic or Latino, Any Race",
- is.na(race) ~ NA_character_,
- TRUE ~ "Other Race - non-Hispanic"
- )
- )
-
-
-
-
-
-
-about_data(medstar_epcr) # 35,557 observations and 40 variables
-
-
-[1] "35,557 observations and 40 variables"
-
-
-
-
-medstar_epcr <- medstar_epcr %>%
- mutate_at(
- .vars = vars(starts_with("detect_")),
- .funs = funs(answered = !is.na(.))
- ) %>%
- mutate(
- answered_count = select(., ends_with("answered")) %>% rowSums()
- ) %>%
- mutate(
- screened = answered_count > 0
- )
-
-
-
-
-
-
-about_data(medstar_epcr) # 35,557 observations and 56 variables
-
-
-[1] "35,557 observations and 56 variables"
-
-
-
-
-All numerical variables must be class numeric – as opposed to integer – to work in fastLink below.
- - - -medstar_epcr <- medstar_epcr %>%
- mutate_if(is.integer, as.numeric)
-
-
-
-
-
-
-about_data(medstar_epcr) # 35,557 observations and 56 variables
-
-
-[1] "35,557 observations and 56 variables"
-
-
-
-
-There were some rows with the same pcr number and only differed by whether or not DETECT screenings were completed.
- - - -# Data checks
-# Multiple pcr's per response? Yes
-# medstar_epcr %>%
-# group_by(response_num) %>%
-# summarise(n = length(unique(incident_pcr))) %>%
-# filter(n > 1)
-# Multiple responses per pcr? No
-# medstar_epcr %>%
-# group_by(incident_pcr) %>%
-# summarise(n = length(unique(response_num))) %>%
-# filter(n > 1)
-
-
-
-Add a dummy variable that indicates whether or not the pcr number is duplicated
- - - -medstar_epcr <- medstar_epcr %>%
- group_by(incident_pcr) %>%
- mutate(pcr_dup = max(row_number()) > 1)
-
-
-
-
-
-
-about_data(medstar_epcr) # 35,557 observations and 57 variables
-
-
-[1] "35,557 observations and 57 variables"
-
-
-
-How many pairs of duplicate pcr numbers are there?
- - - -medstar_epcr %>%
- group_by(incident_pcr) %>%
- summarise(duplicate_pairs = any(pcr_dup == TRUE)) %>%
- pull(duplicate_pairs) %>%
- sum() # 7,282
-
-
-[1] 7282
-
-
-
-There are 7,282 incident pcr numbers that are duplicated in the data.
-For each of those pcr numbers, if the only thing that differs between the two rows is that one has the DETECT tool filled out and the other doesn’t, then we only want to keep the row with the DETECT screening information.
-So, for each variable of interest, create a dummy variable that indicates if if values are different within incident pcr number
- - - -medstar_epcr <- medstar_epcr %>%
- mutate_at(
- .vars = vars(
- arrival_time, response_num, incident_pcr, incident_complaint, age,
- name_full, dob, address_street, address_city, address_state,
- address_zip, gender, race, symptoms, crew_sig, disposition
- ),
- .funs = funs(diff = as.numeric(length(unique(.)) > 1))
- ) %>%
- ungroup()
-
-
-
-Now, count the total number of differences between rows (within incident pcr number) for the variables above
- - - -medstar_epcr <- medstar_epcr %>%
- mutate(
- diff_count = select(., ends_with("_diff")) %>% rowSums()
- )
-
-
-
-
-
-
-# Data checking
-# medstar_epcr %>%
-# filter(pcr_dup) %>%
-# select(incident_pcr, pcr_dup, ends_with("_diff"), aps_report, answered_count, diff_count)
-
-
-
-Below we will need to drop selected rows from the data. Adding row numbers to the data set here will make it easier to do so without using any potential personal identifiers to filter the data.
- - - -medstar_epcr <- medstar_epcr %>% mutate(row = row_number()) %>% select(row, everything())
-
-
-
-Are there any cases where there is a duplicated incident pcr number and there are differences between the values in the rows other than DETECT screenings? Results are hidden to protect participant privacy.
- - - -medstar_epcr %>%
- filter(pcr_dup & diff_count > 0)
-
-
-
-Yes. There is one duplicated incident pcr with differing information between rows for race (row 16530 = White, row 16531 = Missing) and disposition (row 16530 = Transported, row 16531 = Canceled False Call). Additionally, row 16530 has a completed DETECT screening. Therefore, we will drop row 16531 and change pcr_dup to FALSE for row 16530.
- - - -medstar_epcr <- medstar_epcr %>%
- filter(row != 16531) %>%
- mutate(pcr_dup = if_else(row == 16530, FALSE, pcr_dup))
-
-
-
-At this point, all remaining duplicate incident pcr numbers have identical values for each of the variables compared above and diff_counts of 0. There is no need to keep those variables in the data.
- - - -medstar_epcr <- medstar_epcr %>%
- select(-ends_with("_diff"), -diff_count)
-
-
-
-
-
-
-about_data(medstar_epcr) # 35,556 observations and 58 variables
-
-
-[1] "35,556 observations and 58 variables"
-
-
-
-Are there any cases where there is a duplicated incident pcr number and answered count is 0 for all rows? Results are hidden to protect participant privacy.
- - - -medstar_epcr %>%
- filter(pcr_dup) %>%
- group_by(incident_pcr) %>%
- filter(all(answered_count == 0))
-
-
-
-Yes, there is one such incident pcr number. The only difference between the two rows is that the value for aps_report_num in row 4467 is “homeless” and missing in row 4468. Therefore, we will drop row 4468 and change pcr_dup to FALSE for 4467.
- - - -medstar_epcr <- medstar_epcr %>%
- filter(row != 4468) %>%
- mutate(pcr_dup = if_else(row == 4467, FALSE, pcr_dup))
-
-
-
-
-
-
-about_data(medstar_epcr) # 35,555 observations and 58 variables
-
-
-[1] "35,555 observations and 58 variables"
-
-
-
-Are there any cases where there is a duplicated incident pcr number and answered counts differ, but the answered count is not 0 for any of the rows. In other words, a different number of screening items was answered. Results are hidden to protect participant privacy.
- - - -medstar_epcr %>%
- filter(pcr_dup) %>%
- group_by(incident_pcr) %>%
- filter(!any(answered_count == 0)) %>%
- ungroup() %>%
- # I already know there are no differences in name etc. We cleaned that up
- # above.
- select(row, name_full, crew_sig, aps_report_num, aps_report,
- starts_with("detect_"), answered_count, -ends_with("_answered"))
-
-
-
-Yes, there are 130 rows (63 unique incident pcr numbers) where more than one row has the DETECT screening filled out.
-There doesn’t appear to be an systematic differences between the rows. For example, the first row in a pair is always more complete or something like that. Therefore, we’re going to filter in stages in such a way as to maximize information retention.
-medstar_epcr <- medstar_epcr %>%
- arrange(desc(pcr_dup), incident_pcr, desc(answered_count), aps_report_num) %>%
- group_by(incident_pcr) %>%
- mutate(
- group_row = row_number(),
- keep_row = group_row == 1
- ) %>%
- ungroup() %>%
- arrange(row)
-
-
-
-NOTE: If there is only one row for a given incident pcr number, then it will always have keep_row == 1.
- - - -# Data checks
-# medstar_epcr %>%
-# filter(pcr_dup) %>%
-# select(row, row, incident_pcr, group_row, keep_row, answered_count, aps_report_num) %>%
-# group_by(incident_pcr) %>%
-# filter(any(group_row > 2))
-
-
-
-Drop duplicate rows and unneeded variables
- - - -medstar_epcr <- medstar_epcr %>%
- filter(keep_row) %>%
- select(-row, -pcr_dup, -group_row, -keep_row)
-
-
-
-
-
-
-about_data(medstar_epcr) # 28,228 observations and 56 variables
-
-
-[1] "28,228 observations and 56 variables"
-
-
-
-
-feather::write_feather(
- medstar_epcr,
- "/Volumes/sph_research/Detect/one_year_data/medstar_epcr_02_variable_management.feather"
-)
-
-
-
-
-rm(list = ls())
-
-
-
-
-
-
-R version 3.5.1 (2018-07-02)
-Platform: x86_64-apple-darwin15.6.0 (64-bit)
-Running under: macOS 10.14.2
-
-Matrix products: default
-BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
-LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
-
-locale:
-[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
-
-attached base packages:
-[1] stats graphics grDevices utils datasets methods base
-
-other attached packages:
- [1] bindrcpp_0.2.2 bfuncs_0.2.1 forcats_0.3.0 stringr_1.3.1 dplyr_0.7.8 purrr_0.2.5
- [7] readr_1.1.1 tidyr_0.8.2 tibble_1.4.2 ggplot2_3.1.0 tidyverse_1.2.1
-
-loaded via a namespace (and not attached):
- [1] Rcpp_1.0.0 cellranger_1.1.0 pillar_1.3.0 compiler_3.5.1 plyr_1.8.4 bindr_0.1.1
- [7] tools_3.5.1 packrat_0.4.9-3 jsonlite_1.5 lubridate_1.7.4 nlme_3.1-137 gtable_0.2.0
-[13] lattice_0.20-35 pkgconfig_2.0.2 rlang_0.3.0.1 cli_1.0.1 rstudioapi_0.8 yaml_2.2.0
-[19] haven_1.1.2 withr_2.1.2 xml2_1.2.0 httr_1.3.1 knitr_1.20 hms_0.4.2
-[25] grid_3.5.1 tidyselect_0.2.5 glue_1.3.0 R6_2.3.0 readxl_1.1.0 feather_0.3.1
-[31] modelr_0.1.2 magrittr_1.5 backports_1.1.2 scales_1.0.0 rvest_0.3.2 assertthat_0.2.0
-[37] colorspace_1.3-2 stringi_1.2.4 lazyeval_0.2.1 munsell_0.5.0 broom_0.5.0 crayon_1.3.4
-
-
-