diff --git a/R/dam_locations.R b/R/dam_locations.R index 128f73e..f4c4a35 100644 --- a/R/dam_locations.R +++ b/R/dam_locations.R @@ -158,117 +158,212 @@ get_dam_locations <- function(dams, nid) { feature_data_source) } -# TODO: work out precise hydrologic locations. -get_hydrologic_locations <- function(dams, hydrologic_locations, nhdpv2_fline, - da_diff_thresh = 0.5, search_radius_m = 500, - max_matches_in_radius = 5) { +utility_link_dams <- function(to_link, dams, nhdpv2_fline, vaa, search_radius_m, max_matches_in_radius) { + new_hl <- nhdplusTools::get_flowline_index(nhdpv2_fline, + to_link, + search_radius = units::set_units( + search_radius_m, "m"), + max_matches = max_matches_in_radius) - v2_area <- select(nhdplusTools::get_vaa(), - nhdpv2_COMID = comid, - nhdpv2_totdasqkm = totdasqkm) + sf::st_drop_geometry(select(to_link, provider_id)) %>% + mutate(id = seq_len(nrow(.))) %>% + left_join(new_hl, by = "id") %>% + left_join(select(sf::st_drop_geometry(dams), + provider_id, drainage_area_sqkm), + by = "provider_id") %>% + left_join(select(sf::st_drop_geometry(nhdpv2_fline), + COMID, drainage_area_sqkm_nhdpv2 = TotDASqKM), + by = "COMID") %>% + left_join(select(vaa, COMID = comid, hydroseq), by = "COMID") %>% + mutate(da_diff = abs(drainage_area_sqkm - drainage_area_sqkm_nhdpv2) / drainage_area_sqkm) +} + +get_dam_network_location <- function(dams, update_index, nhdpv2_fline, vaa, + search_radius_m, max_matches_in_radius, index_type) { + linked_dams_base <- select(dams[update_index, ], provider_id, nhdpv2_COMID) - all_gages$nhdpv2_REACHCODE <- NA - all_gages$nhdpv2_REACH_measure <- NA - all_gages$nhdpv2_COMID <- NA + linked_dams <- linked_dams_base %>% + left_join(select(sf::st_drop_geometry(nhdpv2_fline), COMID, REACHCODE, FromMeas), + by = c("nhdpv2_COMID" = "COMID")) - for(hl in hydrologic_locations) { - - hl$locations <- hl$locations[hl$locations$provider_id %in% all_gages$provider_id, ] - - provider_selector <- all_gages$provider %in% hl$provider - - matcher <- match(hl$locations$provider_id, - all_gages$provider_id[provider_selector] - ) - - all_gages$nhdpv2_REACHCODE[provider_selector][matcher] <- - hl$locations$nhdpv2_REACHCODE - all_gages$nhdpv2_REACH_measure[provider_selector][matcher] <- - hl$locations$nhdpv2_REACH_measure - all_gages$nhdpv2_COMID[provider_selector][matcher] <- - hl$locations$nhdpv2_COMID - - # Some gages missing reachcode/measure but have COMID - update_index <- is.na(all_gages$nhdpv2_REACH_measure & !is.na(all_gages$nhdpv2_COMID)) - - if(any(update_index)) { - linked_gages <- select(all_gages[update_index, ], provider_id, nhdpv2_COMID) %>% - left_join(select(sf::st_drop_geometry(nhdpv2_fline), COMID, FromMeas), - by = c("nhdpv2_COMID" = "COMID")) - - all_gages$nhdpv2_REACH_measure[update_index] <- - linked_gages$FromMeas - } + # get precise network location + + fline_sites <- dplyr::filter(nhdpv2_fline, COMID %in% linked_dams$nhdpv2_COMID) |> + sf::st_transform(sf::st_crs(linked_dams)) |> + sf::st_cast("LINESTRING") + + linked_dams <- utility_link_dams(linked_dams, dams, fline_sites, vaa, + search_radius_m, max_matches_in_radius) + + linked_dams_dedup <- linked_dams %>% + left_join(select(sf::st_drop_geometry(dams), provider_id, nhdpv2_COMID), by = "provider_id") %>% + group_by(provider_id) %>% + filter(COMID == nhdpv2_COMID) %>% + ungroup() + + if(!any(duplicated(linked_dams_dedup$provider_id))) { + linked_dams_base <- linked_dams_base %>% + left_join(select(linked_dams_dedup, provider_id, REACHCODE, REACH_meas, drainage_area_sqkm_nhdpv2), + by = 
"provider_id") + } else { + stop() } - all_gages <- left_join(all_gages, v2_area, by = "nhdpv2_COMID") + if(!nrow(linked_dams_base) == sum(update_index)) stop() - diff_da <- abs(all_gages$nhdpv2_totdasqkm - - all_gages$drainage_area_sqkm) / - all_gages$drainage_area_sqkm + dams$nhdpv2_REACHCODE[update_index] <- + linked_dams_base$REACHCODE + dams$nhdpv2_REACH_measure[update_index] <- + linked_dams_base$REACH_meas - bad_da <- all_gages[!is.na(diff_da) & diff_da > da_diff_thresh, ] + dams$index_type[update_index] <- index_type - update_index <- which(is.na(all_gages$nhdpv2_COMID) | - all_gages$provider_id %in% bad_da$provider_id) + dams +} + +#' get dam hydrolocations +#' @param dams sf data.frame table of dams with best estimate of comid and drainage area +#' @param nhdpv2_fline sf data.frame of flowlines to index to +#' @param da_diff_thresh numeric between 0 an 1 +#' if the normalized difference between prior estimate +#' drainage area and the NHDPlusV2 modeled drainage area is greater than this, +#' the dam is not indexed to the network. +#' @param search_radius_m numeric distance to search from dam location to flowline +#' @param max_matches_in_radius maximum flowines to consider within search radius +#' @description +#' Given input with estimates of NHDPlusV2 comid and prior estimates of drainage area, +#' this function attempts to figure out which dams should be indexed to the network +#' vs which should be left associated to a comid because they drain a very small area +#' vs which should be left un indexed because uncertainty is too high. +#' +get_dam_hydrolocations <- function(dams, nhdpv2_fline, vaa, + da_diff_thresh = 0.1, + search_radius_m = 500, + max_matches_in_radius = 5) { + + dams$nhdpv2_COMID <- as.integer(gsub("https://geoconnex.us/nhdplusv2/comid/", "", + dams$nhdpv2_COMID)) + + dams$nhdpv2_REACHCODE <- NA + dams$nhdpv2_REACH_measure <- NA + + dams$drainage_area_sqkm <- dams$drainage_area_sqkm_nawqa + dams$drainage_area_sqkm[is.na(dams$drainage_area_sqkm)] <- + dams$drainage_area_sqkm_nid[is.na(dams$drainage_area_sqkm)] + + dams <- left_join(dams, select(sf::st_drop_geometry(nhdpv2_fline), + nhdpv2_COMID = COMID, drainage_area_sqkm_nhdpv2 = TotDASqKM), + by = "nhdpv2_COMID") + + dams$drainage_area_sqkm[dams$drainage_area_sqkm == 0] <- NA + + # normalized difference in drainage area using NiD/NAWQA best estimate to normalize + # we can use this to evaluate whether what the COMID models is reasonable compared + # to what the NID and NAWQA has as an estimate. 
+ dams$norm_diffda <- (dams$drainage_area_sqkm - dams$drainage_area_sqkm_nhdpv2) / + dams$drainage_area_sqkm + + # these are where we have a reasonable network-drainage area match + update_index <- is.na(dams$nhdpv2_REACH_measure) & !is.na(dams$nhdpv2_COMID) & + (!is.na(dams$norm_diffda) & abs(dams$norm_diffda) < da_diff_thresh) + + dams$index_type <- rep(NA_character_, nrow(dams)) + + if(any(update_index)) { + + dams <- get_dam_network_location(dams, update_index, nhdpv2_fline, vaa, + search_radius_m, max_matches_in_radius, + "nawqa_on_network_da_match") + } - no_location <- all_gages[update_index, ] + # these are where we have a small catchment worth indexing to despite a da mismatch + update_index <- is.na(dams$nhdpv2_REACH_measure) & # no reachcode measure yet + !is.na(dams$nhdpv2_COMID) & # has a comid + (!is.na(dams$norm_diffda) & # has a normalized area diff + abs(dams$norm_diffda) > da_diff_thresh & # diff is larger than thresh + dams$drainage_area_sqkm_nhdpv2 < 10) # but this is near a headwater - no_location <- st_transform(no_location, 5070) + if(any(update_index)) { + + dams <- get_dam_network_location(dams, update_index, nhdpv2_fline, vaa, + search_radius_m, max_matches_in_radius, + "nawqa_<10sqkm_da_mismatch") + } - new_hl <- nhdplusTools::get_flowline_index(nhdpv2_fline, - no_location, - search_radius = units::set_units( - search_radius_m, "m"), - max_matches = max_matches_in_radius) + # these are where we don't have a network drainage area match but NAWQA assigned a COMID + update_index <- is.na(dams$nhdpv2_REACH_measure) & !is.na(dams$nhdpv2_COMID) & + (!is.na(dams$norm_diffda) & abs(dams$norm_diffda) > da_diff_thresh) + if(any(update_index)) { + dams$index_type[update_index] <- "nawqa_off_network_da_mismatch" + } - linked_gages <- st_drop_geometry(select(no_location, provider_id)) %>% - mutate(id = seq_len(nrow(.))) %>% - left_join(new_hl, by = "id") %>% - left_join(select(st_drop_geometry(all_gages), - provider_id, drainage_area_sqkm), - by = "provider_id") %>% - left_join(v2_area, by = c("COMID" = "nhdpv2_COMID")) %>% - mutate(da_diff = abs(drainage_area_sqkm - nhdpv2_totdasqkm)) + # now look at everything where there is no prior COMID estimate + update_index <- which(is.na(dams$nhdpv2_COMID)) + + no_location <- dams[update_index, ] - linked_gages_dedup <- bind_rows( - linked_gages %>% + no_location <- sf::st_transform(no_location, 5070) + + linked_dams <- utility_link_dams(no_location, dams, nhdpv2_fline, vaa, + search_radius_m, max_matches_in_radius) %>% + filter(is.na(da_diff) | da_diff < da_diff_thresh) + + linked_dams_dedup <- bind_rows( + linked_dams %>% group_by(provider_id) %>% filter(is.na(da_diff)) %>% filter(offset == min(offset)) %>% - ungroup(), - linked_gages %>% + ungroup() %>% + mutate(index_type = "automatic_network_closest_flowline_no_da_check"), + linked_dams %>% group_by(provider_id) %>% filter(!is.na(da_diff)) %>% filter(da_diff == min(da_diff)) %>% - ungroup()) %>% + ungroup() %>% + mutate(index_type = "automatic_network_closest_drainage_area")) %>% group_by(provider_id) %>% + filter(hydroseq == min(hydroseq)) %>% filter(n() == 1) %>% ungroup() - linked_gages <- select(no_location, provider_id) %>% + linked_dams <- select(no_location, provider_id) %>% mutate(id = seq_len(nrow(.))) %>% - left_join(select(linked_gages_dedup, - id, COMID, REACHCODE, REACH_meas), + left_join(select(linked_dams_dedup, + id, COMID, REACHCODE, REACH_meas, index_type, drainage_area_sqkm_nhdpv2), by = "id") - all_gages$nhdpv2_REACHCODE[update_index] <- 
linked_gages$REACHCODE - all_gages$nhdpv2_REACH_measure[update_index] <- linked_gages$REACH_meas - all_gages$nhdpv2_COMID[update_index] <- linked_gages$COMID + dams$nhdpv2_REACHCODE[update_index] <- linked_dams$REACHCODE + dams$nhdpv2_REACH_measure[update_index] <- linked_dams$REACH_meas + dams$nhdpv2_COMID[update_index] <- linked_dams$COMID + dams$index_type[update_index] <- linked_dams$index_type + dams$drainage_area_sqkm_nhdpv2[update_index] <- linked_dams$drainage_area_sqkm_nhdpv2 + + dams <- select(dams, -norm_diffda) - all_gages + dams } -add_mainstems <- function(gage_hydrologic_locations, mainstems, vaa) { - mainstems <- mainstems[,c("id", "uri"), drop = TRUE] - mainstems$id <- as.integer(mainstems$id) - vaa <- right_join(vaa, mainstems, by = c("levelpathi" = "id")) - - vaa <- vaa[,c("comid", "uri")] - - names(vaa) <- c("comid", "mainstem_uri") +add_mainstems <- function(dam_hydrologic_locations, mainstems, vaa) { + mainstems <- mainstems[,c("head_nhdpv2_COMID", "uri"), drop = TRUE] + + mainstems$head_nhdpv2_COMID <- as.integer(gsub("https://geoconnex.us/nhdplusv2/comid/", "", + mainstems$head_nhdpv2_COMID)) + + # stolen from ref_gages + mainstem_lookup <- group_by(vaa, levelpathi) |> + filter(hydroseq == max(hydroseq)) |> + ungroup() |> + select(head_nhdpv2_COMID = comid, levelpathi) |> + distinct() |> + left_join(mainstems, by = "head_nhdpv2_COMID") |> + filter(!is.na(uri)) |> + select(-head_nhdpv2_COMID) |> + right_join(select(vaa, comid, levelpathi), + by = "levelpathi") |> + select(-levelpathi, comid, mainstem_uri = uri) + + dplyr::left_join(dam_hydrologic_locations, mainstem_lookup, + by = c("nhdpv2_COMID" = "comid")) - left_join(gage_hydrologic_locations, vaa, - by = c("nhdpv2_COMID" = "comid")) } diff --git a/R/get_data.R b/R/get_data.R index 59d0132..339a332 100644 --- a/R/get_data.R +++ b/R/get_data.R @@ -42,3 +42,16 @@ get_nid_csv <- function(out_file = "data/nation.csv", readr::read_csv(out_file, skip = 1) } +get_all_mainstems <- function(outdir) { + url <- "https://www.hydroshare.org/resource/3cc04df349cd45f38e1637305c98529c/data/contents/mainstems.gpkg" + + dir.create(outdir, recursive = TRUE, showWarnings = FALSE) + + f <- file.path(outdir, basename(url)) + + if(!file.exists(f)) { + download.file(url, destfile = f, mode = "wb") + } + + sf::read_sf(f) +} diff --git a/R/write_out.R b/R/write_out.R index 73e0d28..f1b95fc 100644 --- a/R/write_out.R +++ b/R/write_out.R @@ -1,11 +1,26 @@ write_reference <- function(dam_locations, registry, providers, reference_file, nldi_file) { + if(is.numeric(dam_locations$nhdpv2_COMID)) dam_locations$nhdpv2_COMID <- + vapply(dam_locations$nhdpv2_COMID, \(x) { + if(is.na(x)) NA_character_ else paste0("https://geoconnex.us/nhdplusv2/comid/", x) + }, FUN.VALUE = c("")) + + if(all(is.na(dam_locations$nhdpv2_REACHCODE) | !grepl("https", dam_locations$nhdpv2_REACHCODE))) { + dam_locations$nhdpv2_REACHCODE_uri <- + vapply(dam_locations$nhdpv2_REACHCODE, \(x) { + if(is.na(x)) NA_character_ else paste0("https://geoconnex.us/nhdplusv2/reachcode/", x) + }, FUN.VALUE = c("")) + } + out <- dam_locations %>% mutate(identifier = paste0(provider, provider_id)) %>% left_join(select(convert_provider_id(registry, providers), uri, identifier, id), by = "identifier") %>% select(id, uri, name, description, subjectOf, - provider, provider_id, nhdpv2_COMID, feature_data_source) %>% + provider, provider_id, + nhdpv2_COMID, nhdpv2_REACHCODE_uri, nhdpv2_REACHCODE, nhdpv2_REACH_measure, + drainage_area_sqkm, drainage_area_sqkm_nhdpv2, index_type, + mainstem_uri, 
feature_data_source) %>% mutate(id = as.integer(id)) write_sf(out, reference_file) diff --git a/_targets.R b/_targets.R index 2d3e3e3..768fc6d 100644 --- a/_targets.R +++ b/_targets.R @@ -20,8 +20,8 @@ list( # Only the network flowlines for now -- non-network could be pulled in. tar_target("nhdpv2_fline", read_sf(nat_db, "NHDFlowline_Network")), tar_target("nhdpv2_fline_proc", select(st_transform(nhdpv2_fline, 5070), - COMID, REACHCODE, ToMeas, FromMeas)), - + COMID, REACHCODE, ToMeas, FromMeas, TotDASqKM)), + tar_target("mainstems", get_all_mainstems("data/mainstems/")), tar_target("dams", get_dam_data("data/dams/", sb = "5fb7e483d34eb413d5e14873", @@ -35,34 +35,28 @@ list( tar_target("nid", left_join(nid_gpkg, nid_meta, by = c("federalId" = "Federal ID"))), - - # this function filters and renames gage locations to a common table # there is a significant ammount of logic to prefer the older NAWQA-based QC # data source (dams) over the newer nid data source. See function internals # for details. tar_target("dam_locations", get_dam_locations(dams, nid)), - # TODO: Add specific hydrologic locations - # tar_target("vaa", get_vaa(atts = c("comid", "levelpathi", "hydroseq"), - # updated_network = TRUE)), - # # This function takes a table of all NWIS and more in the future gage - # # locations and a list of provided hydrologic locations. The provider - # # is a way to join on provider and provider_id in the all_gages input. - # # The order that hydrologic locations sources are provided will determine - # # precidence -- last defined wins. - # gage_hydrologic_locations = get_hydrologic_locations( - # all_gages = gage_locations, - # hydrologic_locations = list( - # list(provider = "https://waterdata.usgs.gov", - # locations = nwis_gage_hydro_locatons), - # list(provider = "https://cdec.water.ca.gov", - # locations = cdec_gage_address)), - # nhdpv2_fline = nhdpv2_fline_proc), - # - # gage_hydrologic_locations_with_mainstems = add_mainstems(gage_hydrologic_locations, - # mainstems, vaa), - # + + tar_target("vaa", get_vaa(atts = c("comid", "levelpathi", "hydroseq"), + updated_network = TRUE)), + + # This function takes the table of all dam locations and indexes it to + # NHDPlusV2 flowlines. A prior COMID estimate and the NID/NAWQA drainage area + # are used to decide whether each dam can be indexed to the network; + # see get_dam_hydrolocations() for the decision logic. + tar_target("dam_hydrologic_locations", get_dam_hydrolocations( + dams = dam_locations, + nhdpv2_fline = sf::st_zm(nhdpv2_fline_proc), vaa = vaa)), + + tar_target("dam_hydrologic_locations_with_mainstems", + add_mainstems(dam_hydrologic_locations, + mainstems, vaa)), ### Registry ### # Each entry will have a provider and provider_id that acts as a unique # primary key. The existing registry file will have a unique attribute @@ -75,9 +69,9 @@ list( registry = "reg/ref_dams.csv", providers = providers)), - tar_target("reference_out", write_reference(dam_locations, + tar_target("reference_out", write_reference(dam_hydrologic_locations_with_mainstems, registry, providers, reference_file, nldi_file)), - tar_target("registry_out", write_registry(registry, "reg/ref_dams %>% .csv")) + tar_target("registry_out", write_registry(registry, "reg/ref_dams.csv")) )
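
A rough usage sketch (not part of the patch) of the sequence the _targets.R pipeline wires together, run interactively. The nat_db path below is a placeholder for the national NHDPlusV2 geopackage, and dam_locations is assumed to already exist from get_dam_locations(dams, nid):

library(sf)
library(dplyr)

nat_db <- "data/nhdplusv2.gpkg"  # placeholder path to the national NHDPlusV2 geopackage

nhdpv2_fline_proc <- read_sf(nat_db, "NHDFlowline_Network") |>
  st_transform(5070) |>
  select(COMID, REACHCODE, ToMeas, FromMeas, TotDASqKM)

vaa <- nhdplusTools::get_vaa(atts = c("comid", "levelpathi", "hydroseq"),
                             updated_network = TRUE)

mainstems <- get_all_mainstems("data/mainstems/")

dam_hl <- get_dam_hydrolocations(dams = dam_locations,
                                 nhdpv2_fline = sf::st_zm(nhdpv2_fline_proc),
                                 vaa = vaa)

dam_hl <- add_mainstems(dam_hl, mainstems, vaa)

# how many dams landed in each indexing category
table(dam_hl$index_type, useNA = "ifany")

Since write_reference() carries index_type through to the reference file, this table is a quick way to spot-check the balance between drainage-area matches and distance-only matches.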
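
add_mainstems() keys each mainstem URI off the most-upstream comid (max hydroseq) of a levelpath and then spreads that URI to every comid on the levelpath. A toy illustration of that lookup, simplified from the patched helper (the distinct()/NA filtering is dropped and the final select is written positively), with made-up comid, levelpath, and URI values:

library(dplyr)

vaa_toy <- tibble::tribble(
  ~comid, ~levelpathi, ~hydroseq,
    101L,          5L,       30L,  # most upstream on levelpath 5
    102L,          5L,       20L,
    103L,          5L,       10L)  # outlet of levelpath 5

mainstems_toy <- tibble::tibble(head_nhdpv2_COMID = 101L,
                                uri = "https://geoconnex.us/ref/mainstems/1")

group_by(vaa_toy, levelpathi) |>
  filter(hydroseq == max(hydroseq)) |>
  ungroup() |>
  select(head_nhdpv2_COMID = comid, levelpathi) |>
  left_join(mainstems_toy, by = "head_nhdpv2_COMID") |>
  right_join(select(vaa_toy, comid, levelpathi), by = "levelpathi") |>
  select(comid, mainstem_uri = uri)
# all three comids inherit https://geoconnex.us/ref/mainstems/1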