Skip to content

Commit

Permalink
wrcc and airsis optimization:
Browse files Browse the repository at this point in the history
 * startdate as first argument in wrcc_downloadData()
 * handling NA values in df$Type during in ~QualityControl()
 * got rid of inefficient sapply() in monitor_subset()
  • Loading branch information
jonathancallahan committed Feb 6, 2017
1 parent 8d38c94 commit 504a9ed
Show file tree
Hide file tree
Showing 8 changed files with 73 additions and 64 deletions.
10 changes: 5 additions & 5 deletions R/airsis_EBAMQualityControl.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,15 +52,15 @@ airsis_EBAMQualityControl <- function(df,

# Latitude and longitude must be in range
if (remove_Lon_zero) {
goodLonMask <- !is.na(df$Longitude) & df$Longitude >= valid_Longitude[1] & df$Longitude <= valid_Longitude[2] & df$Longitude != 0
goodLonMask <- !is.na(df$Longitude) & (df$Longitude >= valid_Longitude[1]) & (df$Longitude <= valid_Longitude[2]) & (df$Longitude != 0)
} else {
goodLonMask <- !is.na(df$Longitude) & df$Longitude >= valid_Longitude[1] & df$Longitude <= valid_Longitude[2]
goodLonMask <- !is.na(df$Longitude) & (df$Longitude >= valid_Longitude[1]) & (df$Longitude <= valid_Longitude[2])
}

if (remove_Lat_zero) {
goodLatMask <- !is.na(df$Latitude) & df$Latitude >= valid_Latitude[1] & df$Latitude <= valid_Latitude[2] & df$Latitude != 0
goodLatMask <- !is.na(df$Latitude) & (df$Latitude >= valid_Latitude[1]) & (df$Latitude <= valid_Latitude[2]) & (df$Latitude != 0)
} else {
goodLatMask <- !is.na(df$Latitude) & df$Latitude >= valid_Latitude[1] & df$Latitude <= valid_Latitude[2]
goodLatMask <- !is.na(df$Latitude) & (df$Latitude >= valid_Latitude[1]) & (df$Latitude <= valid_Latitude[2])
}

badRows <- !(goodLonMask & goodLatMask)
Expand Down Expand Up @@ -88,7 +88,7 @@ airsis_EBAMQualityControl <- function(df,

# ----- Type ----------------------------------------------------------------

goodTypeMask <- !is.na(df$Type) & df$Type == "PM 2.5"
goodTypeMask <- !is.na(df$Type) & (df$Type == "PM 2.5")

badRows <- !goodTypeMask
badRowCount <- sum(badRows)
Expand Down
10 changes: 5 additions & 5 deletions R/airsis_ESAMQualityControl.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,20 +56,20 @@ airsis_ESAMQualityControl <- function(df,

# Latitude and longitude must be in range
if (remove_Lon_zero) {
goodLonMask <- !is.na(df$Longitude) & df$Longitude >= valid_Longitude[1] & df$Longitude <= valid_Longitude[2] & df$Longitude != 0
goodLonMask <- !is.na(df$Longitude) & (df$Longitude >= valid_Longitude[1]) & (df$Longitude <= valid_Longitude[2]) & (df$Longitude != 0)
} else {
goodLonMask <- !is.na(df$Longitude) & df$Longitude >= valid_Longitude[1] & df$Longitude <= valid_Longitude[2]
goodLonMask <- !is.na(df$Longitude) & (df$Longitude >= valid_Longitude[1]) & (df$Longitude <= valid_Longitude[2])
}

if (remove_Lat_zero) {
goodLatMask <- !is.na(df$Latitude) & df$Latitude >= valid_Latitude[1] & df$Latitude <= valid_Latitude[2] & df$Latitude != 0
goodLatMask <- !is.na(df$Latitude) & (df$Latitude >= valid_Latitude[1]) & (df$Latitude <= valid_Latitude[2]) & (df$Latitude != 0)
} else {
goodLatMask <- !is.na(df$Latitude) & df$Latitude >= valid_Latitude[1] & df$Latitude <= valid_Latitude[2]
goodLatMask <- !is.na(df$Latitude) & (df$Latitude >= valid_Latitude[1]) & (df$Latitude <= valid_Latitude[2])
}

badRows <- !(goodLonMask & goodLatMask)
badRowCount <- sum(badRows)
if (badRowCount > 0) {
if ( badRowCount > 0 ) {
logger.info("Discarding %s rows with invalid location information", badRowCount)
badLocations <- paste('(',df$Longitude[badRows],',',df$Latitude[badRows],')',sep='')
logger.debug("Bad locations: %s", paste0(badLocations, collapse=", "))
Expand Down
71 changes: 37 additions & 34 deletions R/monitor_subsetData.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,38 +58,44 @@ monitor_subsetData <- function(data, tlim=NULL, vlim=NULL, monitorIDs=NULL,
}

# If specified, remove any data columns that have no valid data after time range subsetting
if ( dropMonitors & !is.null(dim(data[,-1])) ) {
if ( dropMonitors ) {

anyMask <- c(TRUE, apply(data[,-1],2,function(x) { any(!is.na(x),na.rm=TRUE) }))
# Sanity check
if ( sum(anyMask) == 1 ) {
# All data missing, only 'datetime' has valid values
warning("All data are missing values.")
return(NULL)
}
data <- data[,anyMask]

if ( !is.null(vlim) ) {
# NOTE: The apply() function converts the first argument to a matrix.
# NOTE: If we pass in a dataframe whose first column is POSIXct then things fail (Is it converting all subsequent columns to POSIXct?)
# NOTE: Therefore, we strip off the first column when we use apply() and then add it back
vlimMask <- apply(data[,-1], 2, function(x) { any((x > vlim[1]) & (x <= vlim[2]), na.rm=TRUE) } )
data <- data[,c(TRUE,vlimMask)]
}

} else if ( dropMonitors & is.null(dim(data[,-1])) ) {

anyLogical <- c(TRUE, any(!is.na(data[,-1]),na.rm=TRUE))
data <- data[,anyLogical]

if ( !is.null(vlim) ) {
if ( ncol(data) > 2 ) {
# Multiple monitors, we can use apply() without worrying our subsetting will return a vector

anyMask <- c(TRUE, apply(data[,-1],2,function(x) { any(!is.na(x),na.rm=TRUE) }))
# Sanity check
if ( sum(anyMask) == 1 ) {
# All data missing, only 'datetime' has valid values
warning("All data are missing values.")
return(NULL)
}
data <- data[,anyMask]

if ( !is.null(vlim) ) {
# NOTE: The apply() function converts the first argument to a matrix.
# NOTE: If we pass in a dataframe whose first column is POSIXct then things fail (Is it converting all subsequent columns to POSIXct?)
# NOTE: Therefore, we strip off the first column when we use apply() and then add it back
vlimMask <- apply(data[,-1], 2, function(x) { any((x > vlim[1]) & (x <= vlim[2]), na.rm=TRUE) } )
data <- data[,c(TRUE,vlimMask)]
}

} else {
# Need to be careful with a single monitor

anyMask <- c(TRUE, any(!is.na(data[,-1]),na.rm=TRUE))
data <- data[,anyMask]

dataParam <- data[,-1]

dataParam[dataParam > vlim[2] & !is.na(dataParam)] <- NA
dataParam[dataParam <= vlim[1] & !is.na(dataParam)] <- NA

data[,-1] <- dataParam
if ( !is.null(vlim) ) {

dataParam <- data[,-1]

dataParam[dataParam > vlim[2] & !is.na(dataParam)] <- NA
dataParam[dataParam <= vlim[1] & !is.na(dataParam)] <- NA

data[,-1] <- dataParam

}

}

Expand All @@ -106,10 +112,7 @@ monitor_subsetData <- function(data, tlim=NULL, vlim=NULL, monitorIDs=NULL,
# TODO: 2017-01-06 Are we still using rownames in the 'data' dataframe?

# Add back YYYYmmddHHMM rownames discarded by dplyr::filter
rowNames <- sapply(data$datetime, function(x){ stringr::str_replace_all(x, "-", "") } )
rowNames <- sapply(rowNames, function(x){ stringr::str_replace(x, " ", "") } )
rowNames <- sapply(rowNames, function(x){ stringr::str_split_fixed(x, ":", 3)[1] } )
rownames(data) <- rowNames
rownames(data) <- strftime(data$datetime, "%Y%m%d%H", tz="UTC")

return(data)

Expand Down
12 changes: 6 additions & 6 deletions R/wrcc_EBAMQualityControl.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,20 +63,20 @@ wrcc_EBAMQualityControl <- function(df,

# Latitude and longitude must be in range
if (remove_Lon_zero) {
goodLonMask <- !is.na(df$GPSLon) & df$GPSLon >= valid_Longitude[1] & df$GPSLon <= valid_Longitude[2] & df$GPSLon != 0
goodLonMask <- !is.na(df$GPSLon) & (df$GPSLon >= valid_Longitude[1]) & (df$GPSLon <= valid_Longitude[2]) & (df$GPSLon != 0)
} else {
goodLonMask <- !is.na(df$GPSLon) & df$GPSLon >= valid_Longitude[1] & df$GPSLon <= valid_Longitude[2]
goodLonMask <- !is.na(df$GPSLon) & (df$GPSLon >= valid_Longitude[1]) & (df$GPSLon <= valid_Longitude[2])
}

if (remove_Lat_zero) {
goodLatMask <- !is.na(df$GPSLat) & df$GPSLat >= valid_Latitude[1] & df$GPSLat <= valid_Latitude[2] & df$GPSLat != 0
goodLatMask <- !is.na(df$GPSLat) & (df$GPSLat >= valid_Latitude[1]) & (df$GPSLat <= valid_Latitude[2]) & (df$GPSLat != 0)
} else {
goodLatMask <- !is.na(df$GPSLat) & df$GPSLat >= valid_Latitude[1] & df$GPSLat <= valid_Latitude[2]
goodLatMask <- !is.na(df$GPSLat) & (df$GPSLat >= valid_Latitude[1]) & (df$GPSLat <= valid_Latitude[2])
}

badRows <- !(goodLonMask & goodLatMask)
badRowCount <- sum(badRows)
if (badRowCount > 0) {
if ( badRowCount > 0 ) {
logger.info("Discarding %s rows with invalid location information", badRowCount)
badLocations <- paste('(',df$GPSLon[badRows],',',df$GPSLat[badRows],')',sep='')
logger.debug("Bad locations: %s", paste0(badLocations, collapse=", "))
Expand All @@ -94,7 +94,7 @@ wrcc_EBAMQualityControl <- function(df,
# ----- Type ----------------------------------------------------------------

# Type: 0=E-BAM PM2.5, 1=E-BAM PM10, 9=E-Sampler. We only want PM2.5 measurements
goodTypeMask <- df$Type == 0
goodTypeMask <- !is.na(df$Type) & (df$Type == 0)
badRows <- !goodTypeMask
badRowCount <- sum(badRows)
if (badRowCount > 0) {
Expand Down
10 changes: 5 additions & 5 deletions R/wrcc_ESAMQualityControl.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,20 +63,20 @@ wrcc_ESAMQualityControl <- function(df,

# Latitude and longitude must be in range
if (remove_Lon_zero) {
goodLonMask <- !is.na(df$GPSLon) & df$GPSLon >= valid_Longitude[1] & df$GPSLon <= valid_Longitude[2] & df$GPSLon != 0
goodLonMask <- !is.na(df$GPSLon) & df$GPSLon >= valid_Longitude[1] & df$GPSLon <= valid_Longitude[2] & (df$GPSLon != 0)
} else {
goodLonMask <- !is.na(df$GPSLon) & df$GPSLon >= valid_Longitude[1] & df$GPSLon <= valid_Longitude[2]
}

if (remove_Lat_zero) {
goodLatMask <- !is.na(df$GPSLat) & df$GPSLat >= valid_Latitude[1] & df$GPSLat <= valid_Latitude[2] & df$GPSLat != 0
goodLatMask <- !is.na(df$GPSLat) & df$GPSLat >= valid_Latitude[1] & df$GPSLat <= valid_Latitude[2] & (df$GPSLat != 0)
} else {
goodLatMask <- !is.na(df$GPSLat) & df$GPSLat >= valid_Latitude[1] & df$GPSLat <= valid_Latitude[2]
}

badRows <- !(goodLonMask & goodLatMask)
badRowCount <- sum(badRows)
if (badRowCount > 0) {
if ( badRowCount > 0 ) {
logger.info("Discarding %s rows with invalid location information", badRowCount)
badLocations <- paste('(',df$GPSLon[badRows],',',df$GPSLat[badRows],')',sep='')
logger.debug("Bad locations: %s", paste0(badLocations, collapse=", "))
Expand All @@ -94,10 +94,10 @@ wrcc_ESAMQualityControl <- function(df,
# ----- Type ----------------------------------------------------------------

# Type: 0=E-BAM PM2.5, 1=E-BAM PM10, 9=E-Sampler. We only want PM2.5 measurements
goodTypeMask <- df$Type == 9
goodTypeMask <- !is.na(df$Type) & (df$Type == 9)
badRows <- !goodTypeMask
badRowCount <- sum(badRows)
if (badRowCount > 0) {
if ( badRowCount > 0 ) {
logger.info("Discarding %s rows with invalid Type information", badRowCount)
logger.debug("Bad Types: %s", paste0(sort(df$Type[badRows]), collapse=", "))
}
Expand Down
12 changes: 9 additions & 3 deletions R/wrcc_downloadData.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,9 @@
# USFSRegionalMonitors <- c()
# MiscellaneousMonitors <- c()

wrcc_downloadData <- function(stationID=NULL, startdate=20100101,
wrcc_downloadData <- function(startdate=20100101,
enddate=strftime(lubridate::now(),"%Y%m%d",tz="GMT"),
stationID=NULL,
baseUrl="http://www.wrcc.dri.edu/cgi-bin/wea_list2.pl") {

# Sanity check
Expand Down Expand Up @@ -73,8 +74,13 @@ wrcc_downloadData <- function(stationID=NULL, startdate=20100101,

if ( class(rawBytes) == "character" ) {
logger.debug(rawBytes)
logger.error("WRCC FTP request returns an error")
stop(rawBytes)
if ( stringr::str_detect(rawBytes, "WRCC data access information") ) {
logger.warn("No data available")
stop("No data available")
} else {
logger.error("WRCC FTP request returns an error")
stop(rawBytes)
}
}

# Convert raw bytes into a string
Expand Down
4 changes: 2 additions & 2 deletions R/wrcc_parseData.R
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,9 @@ wrcc_parseData <- function(fileString) {

# Sanity check
if ( length(monitorTypeCode) > 1 ) {
logger.warn("More than one monitor type detected: %s", paste(monitorTypeCode,sep=", "))
logger.warn("More than one monitor type detected: %s", paste(monitorTypeCode,collapse=", "))
# Pick the most common Type
typeTable <- table(monitorTypeCode)
typeTable <- table(df$Type)
monitorTypeCode <- names(typeTable)[which(typeTable == max(typeTable))]
}

Expand Down
8 changes: 4 additions & 4 deletions man/wrcc_downloadData.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 504a9ed

Please sign in to comment.