This document describes the development of datasets for testing the CHASE assessment tool. By modifying the steps below or by using different sources of data, alternative datasets can be created for input to the CHASE assessment tool.
This could also serve as a model for development the actual assessment as well. However, in order to do this, a number of issues should be addressed, not limited to:
- Determination of the temporal confidence should to be included. At present random values are used.
- Determination of the methodological confidence should be included. This is also represented by random values at present.
For testing purposes, we used indicator data from the HOLAS II assessment. This data was available with indicators per station. The indicator data came from the file CHASEinput060318.xlsx sheet ‘4. BSII’. This sheet does not include station positions. These were added to the dataset by matching up station names to station positions on other sheets.
Using shape files for level 3 assessment units and level 4 assessment units, we mapped the station positions to assessment units, creating two input data sets for the CHASE assessment:
The shapes files for assessment units were downloaded from the HELCOM maps and data service (MADS).
Level 3 : https://maps.helcom.fi/website/MADS/download/?id=e5a59af9-c244-4069-9752-be3acc5dabed
Level 4: https://maps.helcom.fi/website/MADS/download/?id=67d653b1-aad1-4af4-920e-0683af3c4a48
The original shape files contain several columns which are not required. For the purpose of these tests, we are only interested in the names of the assessment units and area in km2. They were therefore modified using the following code, to reduce their size.
library(tidyverse)
library(sf)
units3 <- read_sf(dsn = "../gis/_ags_HELCOM_subbasins_with_coastal_and_offshore_division_2018_11",
layer = "HELCOM_subbasins_with_coastal_and_offshore_division_2018_1")
units3 <- units3 %>%
dplyr::select(HELCOM_ID,level_3,Area_km2=area_km2)
# note: change name of variable area_km2 for consistency
st_write(units3, "assessment_units/AssessmentUnits3.shp",append=F)
units4 <- read_sf(dsn = "../gis/_ags_HELCOM_subbasins_with_coastal_WFD_waterbodies_or_watertypes_2018_11",
layer = "HELCOM_subbasins_with_coastal_WFD_waterbodies_or_watertypes_2018_1")
units4 <- units4 %>%
dplyr::select(Name,HELCOM_ID,Area_km2)
st_write(units4, "assessment_units/AssessmentUnits4.shp",append=F)
Load packages
library(sf)
library(tidyverse)
library(patchwork) # not essential - this is used only to combine the two maps of assessment units
Load shape files for assessment units
units3 <- read_sf(dsn = "./assessment_units",
layer = "AssessmentUnits3")
units4 <- read_sf(dsn = "./assessment_units",
layer = "AssessmentUnits4")
# show maps of the assessment units
map3 <- ggplot() +
theme_minimal(base_size=9) +
ggtitle("Level 3 Assessment Units") +
geom_sf(data=units3, colour="black", fill=NA)
map4 <- ggplot() +
theme_minimal(base_size=9) +
ggtitle("Level 4 Assessment Units") +
geom_sf(data=units4, colour="black", fill=NA)
map3+map4
Read indicator data from text file input/bsii_by_stn.txt
file <- "./input/bsii_by_stn.txt"
df <- read.table(file,sep=";",
header=T,
fileEncoding="UTF-8",
comment.char="")
Convert indicator data frame to simple features
df_sf <- st_as_sf(df, coords = c("longitude", "latitude"), crs = 4326) # EPSG 4326 is WGS84
# transform the sf dataframe to the same projection as the assessment units
df_sf <- st_transform(df_sf,crs=st_crs(units3))
# plot the stations and the assessment units together
ggplot() +
theme_minimal(base_size=9) +
ggtitle("Level 4 Assessment Units \n+ Indicator positions") +
geom_sf(data=units4, colour="black", fill=NA) +
geom_sf(data=df_sf, colour="red")
Intersect the indicator data points with the polygons to get dataframes of indicators with added information about which assessment unit each indicator belongs to. There will be one dataframe for intersection with Level 3 assessment units and another dataframe for Level 4 assessment units.
As can be seen from the map, some station positions should be checked. This is outside the scope of this test.
df3 <- st_intersection(df_sf, units3)
df4 <- st_intersection(df_sf, units4)
# geometry information is no longer needed
df3$geometry <-NULL
df4$geometry <-NULL
# show the head for Level 3 data
print(head(df3))
## Station Matrix Substance Type CR Units Response ConfThresh
## 673 Hailuoto Biota HG HM 4.2861606 All + H
## 674 Hailuoto Biota SBD6 Org 218.8782490 All + H
## 675 Hailuoto Biota SCB6 Org 0.1348837 All + H
## 676 Hailuoto Biota CS-137 Rad 2.2830000 All + H
## 677 HAILUOTO Biota CS-137 Rad 1.5800000 All + H
## 711 Iso-Huituri Biota HG HM 11.9582607 All + H
## ConfStatus Datatype HELCOM_ID level_3 Area_km2
## 673 H All 1 Bothnian Bay Finnish Coastal waters 5548.123
## 674 H All 1 Bothnian Bay Finnish Coastal waters 5548.123
## 675 H All 1 Bothnian Bay Finnish Coastal waters 5548.123
## 676 H All 1 Bothnian Bay Finnish Coastal waters 5548.123
## 677 H All 1 Bothnian Bay Finnish Coastal waters 5548.123
## 711 H All 1 Bothnian Bay Finnish Coastal waters 5548.123
Add counts of stations and observations
# count stations for Level3
stn_count3 <- df3 %>%
distinct(HELCOM_ID,level_3,Matrix,Substance,Station) %>%
group_by(HELCOM_ID,level_3,Matrix,Substance) %>%
summarise(CountStations=n()) %>%
ungroup()
# count observations for Level3
data_count3 <- df3 %>%
group_by(HELCOM_ID,level_3,Matrix,Substance) %>%
summarise(CountData=n()) %>%
ungroup()
# merge L3 counts back to original L3 data
df3 <- df3 %>%
left_join(stn_count3,by=c("HELCOM_ID","level_3","Matrix","Substance")) %>%
left_join(data_count3,by=c("HELCOM_ID","level_3","Matrix","Substance"))
# count stations for Level4
stn_count4 <- df4 %>%
distinct(HELCOM_ID,Name,Matrix,Substance,Station) %>%
group_by(HELCOM_ID,Name,Matrix,Substance) %>%
summarise(CountStations=n()) %>%
ungroup()
# count observations for Level4
data_count4 <- df4 %>%
group_by(HELCOM_ID,Name,Matrix,Substance) %>%
summarise(CountData=n()) %>%
ungroup()
# merge L4 counts back to original L4 data
df4 <- df4 %>%
left_join(stn_count4,by=c("HELCOM_ID","Name","Matrix","Substance")) %>%
left_join(data_count4,by=c("HELCOM_ID","Name","Matrix","Substance"))
Add spatial confidence based on km2 per sample
km2 per sample | Confidence |
---|---|
<500 | High |
500 - 5000 | Moderate |
> 5000 | Low |
km2perSampleBounds<-c(500,5000)
conf<-c("L","M","H")
df3 <- df3 %>%
mutate(km2perSample=Area_km2/CountData)
df3 <- df3 %>%
rowwise() %>%
mutate(ix=length(km2perSampleBounds[km2perSampleBounds>km2perSample])) %>%
mutate(ConfSpatial=conf[ix+1]) %>%
dplyr::select(-ix)
df4 <- df4 %>%
mutate(km2perSample=Area_km2/CountData)
df4 <- df4 %>%
rowwise() %>%
mutate(ix=length(km2perSampleBounds[km2perSampleBounds>km2perSample])) %>%
mutate(ConfSpatial=conf[ix+1]) %>%
dplyr::select(-ix)
Load table of confidences for threshold values from text file input/confidence_thresholds.txt
dfConfThreshold <- read.table("./input/confidence_thresholds.txt",sep=";",header=T)
print(dfConfThreshold)
## Substance Matrix ConfThresh Comment
## 1 ANT Sediment H
## 2 CD Sediment H
## 3 HBCD Sediment H
## 4 PB Sediment M
## 5 SBD6 Sediment H
## 6 TBTIN Sediment H
## 7 PB Biota M
## 8 HG Biota H
## 9 HBCD Biota H
## 10 FLU Biota H
## 11 BAP Biota H
## 12 CD Biota M
## 13 CS-137 Biota M
## 14 PFOS Biota H
## 15 SBD6 Biota H
## 16 SCB6 Biota L
## 17 SDX Biota H
## 18 CD Water H
## 19 CS-137 Water M
## 20 PB Water H
## 21 TBTIN Water H
## 22 VDS Bio. Effects M matrix changed to bio. effects CJM
## 23 PYR1OH Bio. Effects M added by CJM
## 24 PFOS Water H added by CJM
Original source for threshold confidence data is Table 2 in this meeting document: https://portal.helcom.fi/meetings/HOLAS%20II%20HZ%20WS%201-2018-518/MeetingDocuments/2-3%20Confidence%20setting%20for%20CHASE%20integrated%20assessment.pdf
Join threshold confidences to L3 and L4 indicator tables.
Note: the following code block is not run because ConfThresh is already included in the indicator data.
# NOT RUN!
df3 <- df3 %>%
left_join(dfConfThreshold,by=c("Substance","Matrix")) %>%
mutate(AU_scale=3)
df4 <- df4 %>%
left_join(dfConfThreshold,by=c("Substance","Matrix")) %>%
mutate(AU_scale=4)
Add random confidences for method to L3 and L4 indicator tables
conf<-c("L","M","H")
df3[,"ConfMethod"]<-round(runif(nrow(df3), min=0.5, max=3.49999))
df3 <- df3 %>%
rowwise() %>%
mutate(ConfMethod=conf[ConfMethod]) %>%
ungroup()
df4[,"ConfMethod"]<-round(runif(nrow(df4), min=0.5, max=3.49999))
df4 <- df4 %>%
rowwise() %>%
mutate(ConfMethod=conf[ConfMethod]) %>%
ungroup()
Add random temporal confidences to L3 and L4 indicator tables
conf<-c("L","M","H")
df3[,"ConfTemp"]<-round(runif(nrow(df3), min=0.5, max=3.49999))
df3 <- df3 %>%
rowwise() %>%
mutate(ConfTemp=conf[ConfTemp]) %>%
ungroup()
df4[,"ConfTemp"]<-round(runif(nrow(df4), min=0.5, max=3.49999))
df4 <- df4 %>%
rowwise() %>%
mutate(ConfTemp=conf[ConfTemp]) %>%
ungroup()
Select only columns needed
df3 <- df3 %>%
mutate(AU_scale=3) %>%
dplyr::select(AU_scale,AU=level_3,Area_km2,Substance,Type,Matrix,CR,
ConfThresh,CountStations,CountData,ConfSpatial,ConfMethod,ConfTemp)
df4 <- df4 %>%
mutate(AU_scale=4) %>%
dplyr::select(AU_scale,AU=HELCOM_ID,Area_km2,Substance,Type,Matrix,CR,
ConfThresh,CountStations,CountData,ConfSpatial,ConfMethod,ConfTemp)
Save the indicator data, including
- information on which assessment units they belong to
- threshold confidence [H/M/L]
- method confidence [H/M/L]
- number of stations in AU
- number of data points per km2
- spatial confidence [H/M/L]
- temporal confidence [H/M/L]
write.table(df3,file="./input/assessmentdata_L3.csv",sep=";",row.names=F,col.names=T,quote=T)
write.table(df4,file="./input/assessmentdata_L4.csv",sep=";",row.names=F,col.names=T,quote=T)
This markdown document is created by knitting the RMarkdown file test_dataset.Rmd. The steps here for generating test data can be run without generating markdown, using the standalone R script src/create_test_dataset.R
Updated 28-10-2021 [email protected]