Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
victorcazalis authored Jun 21, 2021
1 parent 1b21cca commit 55edf14
Show file tree
Hide file tree
Showing 22 changed files with 3,158 additions and 0 deletions.
92 changes: 92 additions & 0 deletions HFI.01.Data.export.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@

# Move into the raw-data folder for this quarter/region. GBrow (base path) and
# QQ (quarter identifier, e.g. "Q1") are globals assumed to be set by a caller
# script -- TODO confirm where they are defined.
setwd(paste0(GBrow, "0.Data/0.Raw.data/", QQ))

# auk: eBird data handling; plyr: ddply; readr: fast file reading; nlme: mixed models
library(auk) ; library(plyr) ; library(readr) ; library(nlme)
`%not in%` <- function (x, table) is.na(match(x, table, nomatch=NA_integer_))



###################
### Charge data ###
###################
# One raw eBird export file per country/region in the current working directory
files.countries<-list.files(getwd())


###############################################
### EXPORT WITH EXPERTISE RESTRICTIONS ONLY ###
###############################################

### Loop over all country files: filter each eBird export and save two
### versions -- eb (for expertise scoring, shared checklists kept) and
### eb2 (for abundance/occurrence analyses, duplicates removed)
for (i in seq_along(files.countries)){

  # rollup=TRUE collapses subspecies to species; unique=FALSE keeps shared
  # checklists duplicated so every observer is credited for expertise
  eb<-read_ebd(files.countries[i], rollup=TRUE, unique=FALSE)

  ### Reduce dataset
  # Remove columns not used downstream
  eb[,c("state_province", "subnational1_code","subspecies_scientific_name","state","last_edited_date", "subspecies_common_name", "breeding_bird_atlas_category", "effort_area_ha", "county", "subnational2_code", "iba_code", "bcr_code", "usfws_code", "atlas_block", "locality", "locality_id", "locality_type", "first_name", "last_name", "has_media", "reviewed", "x")]<-NULL
  # Remove disapproved observations
  eb<-subset(eb, eb$approved==TRUE)
  # Remove checklists that did not report every species observed
  eb<-subset(eb, eb$all_species_reported==TRUE)
  # Remove domestic species (only Columba livia)
  eb<-subset(eb, eb$category != "domestic")

  ### Format dates (year is computed once here and reused for the age filter;
  ### the original computed it twice)
  eb$year<-as.numeric(format(eb$observation_date, "%Y"))
  eb$month<-as.numeric(format(eb$observation_date, "%m"))
  eb$day<-as.numeric(format(eb$observation_date, "%j"))

  # Remove old observations
  eb<-subset(eb, eb$year>=2010)

  ### eb2: keep only checklists usable for abundance/occurrence modelling --
  ### accepted protocols with duration, and distance unless Stationary;
  ### duplicates (shared checklists) removed by auk_unique()
  eb2<-auk_unique(eb)
  eb2<-subset(eb2, eb2$protocol_type %in% c("Traveling","Stationary","Historical") & !is.na(eb2$duration_minutes))
  eb2<-subset(eb2, eb2$protocol_type=="Stationary" | !is.na(eb2$effort_distance_km))

  # Build the output name from fixed positions in the file name; US and CA
  # exports are split by state/province, so suffix their country code
  country.code<-substr(files.countries[i], nchar(files.countries[i])-20, nchar(files.countries[i])-19)
  NAME<-substr(files.countries[i], nchar(files.countries[i])-17, nchar(files.countries[i])-16)
  if(country.code %in% c("US", "CA")){NAME<-paste0(NAME, "_", country.code)}

  saveRDS(eb, file=paste0(GBrow, "0.Data/1.Exported4Expertise/", QQ, "/Exported4Expertise.", NAME, ".rds"))
  saveRDS(eb2, file=paste0(GBrow, "0.Data/1.Exported4Analyses/", QQ, "/Exported4Analyses.", NAME, ".rds"))

  cat(i) # progress indicator

}




#####################
### MERGING FILES ###
#####################

### Merge for expertise is done in script 02

### Merge files for analyses
setwd(paste0(GBrow, "0.Data/1.Exported4Analyses/", QQ))

# Read every exported RDS once and bind in a single pass. The previous
# rbind-in-a-loop was O(n^2), called list.files() on every iteration, and
# errored when only one file was present (loop 2:1 read a missing index).
files.analyses<-list.files()
df<-do.call(rbind, lapply(files.analyses, readRDS))

library(data.table)
# fwrite is data.table's fast CSV writer
fwrite(df, paste0(GBrow, "0.Data/1.Exported4Analyses/0.Exported.eBird.dataTABLE.HFI.", QQ, ".csv"), row.names=FALSE)








82 changes: 82 additions & 0 deletions HFI.02A.Expertise.calculation.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@


##############################
### CREATE EXPERTISE FILES ###
##############################

### Summarise checklists and observers per file, then merge per quarter (QQ)
# GBrow and QQ are globals set by a caller script -- TODO confirm.
setwd(paste0(GBrow, "0.Data/1.Exported4Expertise/", QQ))

# One exported RDS file per country/region
Files<-list.files()

# Accumulators: one checklist table and one observer table per input file
chlistSTOCK<-list()
observerSTOCK<-list()

### For each exported file, build a per-checklist table (effort covariates +
### species richness) and a per-observer table (effort + species pool)
for(i in seq_along(Files)){
  df<-readRDS(Files[i])


  ### CHECKLIST TABLE: one row per sampling event; each covariate is constant
  ### within a checklist so the first row's value is taken.
  ### (Could be rewritten with data.table for a large speed gain.)
  chlist<-ddply(df, .(df$sampling_event_identifier), function(x){data.frame(
  date=x$observation_date[1],
  time=x$time_observations_started[1],
  distance=x$effort_distance_km[1],
  n.observers=x$number_observers[1],
  duration=x$duration_minutes[1],
  observer=x$observer_id[1],
  rich=nrow(x),                 # species richness = number of rows per checklist
  lon=x$longitude[1],
  lat=x$latitude[1],
  protocol=x$protocol_type[1],
  year=x$year[1]
  )})

  # ddply names the grouping column after the expression; rename it
  names(chlist)<-replace(names(chlist), names(chlist)=="df$sampling_event_identifier", "checklist")

  # Day of year. The original format string was "%d/%m/%Yd" with a stray
  # trailing "d", which yields NA whenever dates are character strings;
  # fixed to "%d/%m/%Y" (a no-op when dates are already Date objects).
  chlist$day<-as.numeric(format(as.Date(chlist$date, "%d/%m/%Y"), "%j"))

  # Starting time "HH:MM" converted to minutes since midnight
  chlist$time.min<-sapply(strsplit(as.character(chlist$time),":"),
                       function(x) {
                         x <- as.numeric(x)
                         x[1]*60+x[2] })

  chlistSTOCK[[i]]<-chlist


  ### OBSERVER TABLE: observation/checklist/species counts per observer;
  ### Species keeps the ";"-separated taxonomic_order codes for later pooling
  observer.var<-ddply(df, .(df$observer_id), function(x){data.frame(Nb_obs=nrow(x),
                 Nb_spc=nlevels(droplevels(as.factor(x$scientific_name))),
                 Nb_checklist=nlevels(droplevels(as.factor(x$sampling_event_identifier))),
                 Species=paste(unique(x$taxonomic_order), collapse=";")
  )})
  names(observer.var)[1]<-"observer_id"

  observerSTOCK[[i]]<-observer.var

  cat(i) # progress indicator

}




# Merge chlist per QQ. do.call(rbind, ...) replaces the previous
# eval(parse(text=...)) string-building, which was fragile, used T as a loop
# variable (shadowing TRUE), and errored when length(Files) == 2 because the
# 3:length(Files) loop ran backwards (3, 2).
chlistMERGE<-do.call(rbind, chlistSTOCK)

saveRDS(chlistMERGE, paste0(GBrow, "0.Data/1.Exported4Expertise/Qmerged/chlist.expertise.", QQ, "merged.rds"))

# Merge observer per QQ (same fix as above)
observerMERGE<-do.call(rbind, observerSTOCK)

saveRDS(observerMERGE, paste0(GBrow, "0.Data/1.Exported4Expertise/Qmerged/observer.expertise.", QQ, "merged.rds"))


174 changes: 174 additions & 0 deletions HFI.02B.Expertise.calculation.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@

### Load all seven quarterly checklist files and merge them
# (lapply + do.call replaces seven copy-pasted readRDS/rbind lines; the
# intermediate chlist1..chlist7 / observer1..observer7 variables were never
# read again downstream)
chlist<-do.call(rbind, lapply(1:7, function(q){
  readRDS(paste0(GBrow, "0.Data/1.Exported4Expertise/Qmerged/chlist.expertise.Q", q, "merged.rds"))
}))


### Load all seven quarterly observer files and merge them
observer<-do.call(rbind, lapply(1:7, function(q){
  readRDS(paste0(GBrow, "0.Data/1.Exported4Expertise/Qmerged/observer.expertise.Q", q, "merged.rds"))
}))

### For each observer, calculate the number of checklists, observations and species
# "Species" holds ";"-separated taxonomic_order codes accumulated per file;
# splitting and de-duplicating them gives each observer's overall species pool
obs<-ddply(observer, .(observer_id), function(x){data.frame(
Nb_checklist=sum(x$Nb_checklist),
Nb_obs=sum(x$Nb_obs),
Nb_spc=length(unique(unlist(strsplit(as.character(x$Species), ";")))),
Species=paste0(unique(unlist(strsplit(as.character(x$Species), ";"))), collapse=";")
)})


### Remove unexperienced observers
# Diagnostics only (results not stored): totals split by whether the observer
# meets the experience thresholds (>= 50 checklists and >= 100 species)
tapply(obs$Nb_checklist, obs$Nb_checklist>=50 & obs$Nb_spc>=100, sum) # Number of checklist I'll remove
tapply(obs$Nb_checklist, obs$Nb_checklist>=50 & obs$Nb_spc>=100, length) # Number of observers I'll remove

# Drop all checklists from observers below either threshold
obs.to.exclude<-subset(obs, obs$Nb_spc<100 | obs$Nb_checklist<50)
chlist2<-subset(chlist, chlist$observer %not in% obs.to.exclude$observer_id)
# NOTE(review): droplevels assumes observer is a factor -- confirm this still
# holds under R >= 4.0, where data.frame() no longer auto-converts strings
chlist2$observer<-droplevels(chlist2$observer)



# Remove NAs
# Report the share of checklists with missing duration before dropping them
cat(paste(100*round(table(is.na(chlist2$duration))["TRUE"]/nrow(chlist2),2), " % of duration values are NA"))
# Keep only the modelling covariates, then drop rows with any NA
chlist2<-chlist2[,c("checklist", "rich", "protocol", "duration", "n.observers", "time.min", "lon", "lat", "day", "observer")]
chlist3<-chlist2[complete.cases(chlist2),]
chlist3$observer<-droplevels(chlist3$observer)



#################################
### MAKE MODELS FOR EXPERTISE ###
#################################


### Model in two steps
# Step 1: fixed effects -- negative-binomial GAM of species richness on effort
# covariates, with smooths for duration and starting time and a space-time
# tensor product (bam() is mgcv's fitter for big datasets)
mod.fix<-mgcv::bam(
rich ~ protocol + n.observers + s(duration) + s(time.min) + te(lon, lat, day),
data=chlist3,
family="nb"
)

# Step 2: random effects -- linear mixed model on the GAM residuals with a
# random intercept per observer (the original comment here wrongly said "GAM")
chlist3$Residus<-residuals(mod.fix)
mod.random<-nlme::lme(
Residus ~ 1, random =~1|observer,
data=chlist3
)



### Predict expertise for a reference checklist: Stationary protocol with
### median effort, time and location across all retained checklists
dfExp<-data.frame(
protocol="Stationary",
n.observers=median(chlist3$n.observers, na.rm=T),
duration=median(chlist3$duration, na.rm=T),
time.min=median(chlist3$time.min, na.rm=T),
lon=median(chlist3$lon, na.rm=T),
lat=median(chlist3$lat, na.rm=T),
day=median(chlist3$day, na.rm=T)
)

# Extract fixed effect prediction (unique value; on the model's link scale)
Pred.fix<-predict(mod.fix, newdata=dfExp)

# Add each observer's random intercept to the reference prediction to get a
# per-observer (Kelling-style) expertise score
Pred.obs<-as.data.frame(nlme::ranef(mod.random))
names(Pred.obs)[1]<-"Pred"
Pred.obs$Pred<-Pred.obs$Pred + as.numeric(Pred.fix)


### SAVE THE EXPERTISE SCORE
# ranef() rownames are the observer levels; match them to obs$observer_id
obs$obsKelling<-Pred.obs$Pred[match(obs$observer_id, rownames(Pred.obs))]

saveRDS(obs, paste0(GBrow, "1.Tables/Expertise.scores.table.rds"))





############################################
### Graphics of fit + covariates effects ###
############################################
# NOTE(review): hard-coded absolute output path -- only works on this machine.
# NOTE(review): ggplot2, DescTools and plyr::revalue are used below but not
# loaded in this script; presumably attached by a caller -- verify.
pdf(paste0("D:/eBird/HFI.project/Figures/Check.figures/Obs.expertise.HFI1.pdf"))

hist(chlist3$rich, breaks=50, main="Richness distribution (check Poisson distribution)")

hist(Pred.obs$Pred, breaks=30, xlab="Kelling observer expertise score", main="Expertise score distribution")

# mgcv diagnostic plots for the fixed-effects GAM (2x2 panel)
par(mfrow=c(2,2)) ; mgcv::gam.check(mod.fix) ; par(mfrow=c(1,1))


### Covariates effects
# For each covariate: predict over its observed range while holding the other
# covariates at their medians (protocol fixed to Stationary), then plot the
# fit with a 95% confidence band.
# Duration
ndf.dur<-data.frame(duration=c(min(chlist3$duration):max(chlist3$duration)), protocol="Stationary", n.observers=median(chlist3$n.observers, na.rm=T), time.min=median(chlist3$time.min, na.rm=T), day=median(chlist3$day, na.rm=T), lat=median(chlist3$lat, na.rm=T), lon=median(chlist3$lon, na.rm=T))
# Columns 8:9 receive predict()'s list(fit, se.fit) and take those names
ndf.dur[,8:9]<-predict(mod.fix, ndf.dur, se.fit=T)
ndf.dur$min<-ndf.dur$fit-1.96*ndf.dur$se.fit
ndf.dur$max<-ndf.dur$fit+1.96*ndf.dur$se.fit
# Back-transform fit (col 8), min (10) and max (11) from the link scale --
# presumably the NB log link -- TODO confirm
for(i in c(8,10,11)){ndf.dur[,i]<-exp(ndf.dur[,i])}
ggplot(ndf.dur)+
  geom_line(aes(x=duration, y=fit))+
  geom_line(aes(x=duration, y=min), linetype="dashed")+
  geom_line(aes(x=duration, y=max), linetype="dashed")+
  ggtitle("Model covariates: duration")+
  scale_y_continuous(limits=c(0, 2*max(ndf.dur$fit, na.rm=T)))

# Time.min (same construction as duration above)
ndf.time<-data.frame(time.min=c(min(chlist3$time.min):max(chlist3$time.min)), protocol="Stationary", n.observers=median(chlist3$n.observers, na.rm=T), duration=median(chlist3$duration, na.rm=T), day=median(chlist3$day, na.rm=T), lat=median(chlist3$lat, na.rm=T), lon=median(chlist3$lon, na.rm=T))
ndf.time[,8:9]<-predict(mod.fix, ndf.time, se.fit=T)
ndf.time$min<-ndf.time$fit-1.96*ndf.time$se.fit
ndf.time$max<-ndf.time$fit+1.96*ndf.time$se.fit
for(i in c(8,10,11)){ndf.time[,i]<-exp(ndf.time[,i])}
ggplot(ndf.time)+
  geom_line(aes(x=time.min, y=fit))+
  geom_line(aes(x=time.min, y=min), linetype="dashed")+
  geom_line(aes(x=time.min, y=max), linetype="dashed")+
  ggtitle("Model covariates: Starting time")+
  scale_y_continuous(limits=c(0, 2*max(ndf.time$fit, na.rm=T)))

# n.observers (same construction as duration above)
ndf.nobs<-data.frame(n.observers=c(min(chlist3$n.observers):max(chlist3$n.observers)), protocol="Stationary", duration=median(chlist3$duration, na.rm=T), time.min=median(chlist3$time.min, na.rm=T), day=median(chlist3$day, na.rm=T), lat=median(chlist3$lat, na.rm=T), lon=median(chlist3$lon, na.rm=T))
ndf.nobs[,8:9]<-predict(mod.fix, ndf.nobs, se.fit=T)
ndf.nobs$min<-ndf.nobs$fit-1.96*ndf.nobs$se.fit
ndf.nobs$max<-ndf.nobs$fit+1.96*ndf.nobs$se.fit
for(i in c(8,10,11)){ndf.nobs[,i]<-exp(ndf.nobs[,i])}
ggplot(ndf.nobs)+
  geom_line(aes(x=n.observers, y=fit))+
  geom_line(aes(x=n.observers, y=min), linetype="dashed")+
  geom_line(aes(x=n.observers, y=max), linetype="dashed")+
  ggtitle("Model covariates: Number of observers")+
  scale_y_continuous(limits=c(0, 2*max(ndf.nobs$fit, na.rm=T)))

# Day map: predict at each checklist's location for six dates through the year
# NOTE(review): the PA="unPA" column is not a model covariate and is ignored
# by predict() -- presumably a leftover from another script; verify.
ndf.map<-as.data.frame(expand.grid(rowN=rownames(chlist3), PA="unPA", day=c(15,74,135,196,258,319), duration=median(chlist3$duration), protocol="Stationary", time.min=median(chlist3$time.min, na.rm=T), n.observers=median(chlist3$n.observers, na.rm=T)))
ndf.map$lon<-chlist3[match(ndf.map$rowN, rownames(chlist3)), "lon"]
ndf.map$lat<-chlist3[match(ndf.map$rowN, rownames(chlist3)), "lat"]
ndf.map$fit<-predict(mod.fix, ndf.map)
# No-op assignment kept as in the original (fit stays on the link scale here)
ndf.map$fit<-(ndf.map$fit)
# Relabel day-of-year codes with month names for facet titles
ndf.map$day<-revalue(as.factor(ndf.map$day), c("15"="January","74"="March","135"="May","196"="July","258"="September","319"="November"))

# Winsorize caps extreme predictions so the hex-bin colour scale stays readable
ggplot(ndf.map)+
  stat_summary_hex(aes(x=lon, y=lat, z=DescTools::Winsorize(fit)))+
  ggtitle("Model covariates: Spatio-temporal variations")+
  facet_wrap(~day)


dev.off()






Loading

0 comments on commit 55edf14

Please sign in to comment.