###########################################################
###########################################################
# readin_herrings_from_web.R
# This R script pulls in quota monitoring tables from the GARFO website.
# It does minimal data cleaning and saves the results in .Rdata and .dta format.
# It currently pulls the herring quota, the haddock catch cap, and the River Herring
# and Shad (RHS) catch caps in both the herring and mackerel fisheries.
# It is mostly "list" friendly -- to add a table, all you need to do is add the
# location of its html file below.
###########################################################
###########################################################
rm(list=ls())
library(rvest)
library(plyr)
library(foreign)
library(data.table)
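# Edit these two paths for your machine.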
YOUR.PROJECT.PATH<-"/home/mlee/Documents/projects/scraper/"
YOUR.OUTPUT.PATH<-"/home/mlee/Documents/projects/scraper/daily_data_out/herrings"
setwd(YOUR.PROJECT.PATH)
# For each table you want to download and store, you need the name of the html file
# and its location on the web. I had to split this up into the site prefix (GARFO),
# the folder (GARFO.FOLDER), and the file name without the .html extension
# (dataset.names). It's a little hinky.
GARFO<-c("https://www.greateratlantic.fisheries.noaa.gov/ro/fso/reports/")
GARFO.FOLDER<-c("herring/","HaddockBycatchReport/","Herring_RHS/","Mackerel_RHS/")
dataset.names<-c("qm_herring","qm_haddock_catch_caps","qm_herring_rhs_catch_caps", "qm_mackerel_rhs_catch_caps")
dataset.names.ext<-paste0(dataset.names,".Rdata")
###########################################################
# You shouldn't need to edit anything below this line.
###########################################################
tables.to.parse<-paste0(GARFO,GARFO.FOLDER,dataset.names,".html")
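# For example, the first entry of tables.to.parse is
# "https://www.greateratlantic.fisheries.noaa.gov/ro/fso/reports/herring/qm_herring.html"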
storage.locations<-file.path(YOUR.OUTPUT.PATH,dataset.names.ext)
#cast to lists
tables.to.parse<-as.list(tables.to.parse)
storage.locations<-as.list(storage.locations)
dataset.names<-as.list(dataset.names)
dataset.names.ext<-as.list(dataset.names.ext)
# Do these files exist? If so, do nothing. If not, create an empty data frame and
# save it in an .Rdata file, so the append step below always has something to load.
test.exist <- function(check.these, df.names) {
  first_time <- !file.exists(check.these)
  if (first_time) {
    empty <- data.frame()
    # name the placeholder the same way the real save below does (no .Rdata extension)
    name <- sub("\\.Rdata$", "", df.names)
    assign(name, empty)
    save(list = name, file = check.these)
  }
}
mapply(test.exist,storage.locations, dataset.names.ext)
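# After this runs, YOUR.OUTPUT.PATH holds one .Rdata file per dataset; any file
# that did not already exist now contains an empty placeholder data frame.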
#############################################
############### Define some functions. ###############
#############################################
# read.in.combine reads in a GARFO quota monitoring table, parses it, and sticks it
# into a data frame. The two patterns below match the raw html lines that carry the
# quota year and the report date.
qy_pattern <- '<u>Quota Year:</u> <strong>'
run_pattern <- '<u>Report Run on:</u> ([^<]*) <br> <u>'
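# These patterns assume the report date and quota year sit on one raw html line
# shaped roughly like (inferred from the string handling in read.in.combine below):
#   ... <u>Report Run on:</u> 2020-05-04 <br> <u>Quota Year:</u> <strong>2020</strong> <em>...</em>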
read.in.combine <- function(mytable) {
  # parse the first html table on the page into a data frame
  file <- read_html(mytable)
  tables <- html_nodes(file, "table")
  myresults <- html_table(tables[1], fill = TRUE)[[1]]
  # re-read the raw html to pull out the report date ...
  thepage <- readLines(mytable)
  run_lines <- grep(run_pattern, thepage, value = TRUE)
  report_date <- strsplit(run_lines, "<br>")[[1]][1]
  report_date <- trimws(gsub("<u>Report Run on:</u>", "", report_date), which = "both")
  # ... and the quota year
  qy_lines <- grep(qy_pattern, thepage, value = TRUE)
  quota_period <- strsplit(qy_lines, "<em>")[[1]][1]
  quota_period <- strsplit(quota_period, "<u>")[[1]][3]
  quota_period <- gsub("Quota Year:</u> <strong>", "", quota_period)
  quota_period <- gsub("</strong>", "", quota_period)
  quota_period <- trimws(quota_period, which = "both")
  # tack both onto the table, coerce the date, and strip the column names down to
  # lowercase alphanumerics
  myresults <- cbind(myresults, report_date, quota_period)
  myresults$report_date <- as.Date(myresults$report_date, "%Y-%m-%d")
  names(myresults) <- tolower(gsub("[^[:alnum:]]", "", names(myresults)))
  myresults
}
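# A quick spot check of the parser (a sketch; assumes the first GARFO page is
# reachable and shaped as described above):
# one.table <- read.in.combine(tables.to.parse[[1]])
# str(one.table)   # should end with report_date and quota_period columns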
#############################################
###########Actually do some stuff############
#############################################
myclean<-lapply(tables.to.parse,read.in.combine)
# load(x) loads each saved data frame and returns its name; mget() then fetches it.
# Without unlist(), this puts the data frames into a list of one-element lists;
# unlist() with recursive=FALSE 'flattens' one level of listing.
my.old.data <- unlist(lapply(storage.locations, function(x) mget(load(x))), recursive = FALSE)
# I should lapply this, but I'm sick of this. I'm writing a loop.
# (A Map() sketch of the same step is at the bottom of this file.)
# Assert that myclean and my.old.data are the same length.
len.clean <- length(myclean)
len.old <- length(my.old.data)
stopifnot(len.clean == len.old)
out_data <- vector("list", len.clean)
# dataset.names contains the desired data frame names; assign() below creates each one.
for (i in 1:len.clean) {
  # rbind the new and old together (skip the empty placeholder on a first run)
  if (nrow(my.old.data[[i]]) == 0) {
    temp <- myclean[[i]]
  } else {
    temp <- rbind(my.old.data[[i]], myclean[[i]])
  }
  # strip out duplicates
  temp <- unique(temp)
  # stick it into the list, just in case
  out_data[[i]] <- temp
  # assign the combined data to a data frame named after the dataset, then save it
  # in both .Rdata and .dta format
  name <- dataset.names[[i]]
  assign(name, temp)
  save(list = name, file = storage.locations[[i]])
  write.dta(temp, file.path(YOUR.OUTPUT.PATH, paste0(dataset.names[[i]], ".dta")))
  # shouldn't be necessary, but just in case
  rm(temp)
  rm(name)
}
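# For reference, a Map() version of the loop above (a sketch, not run here; it
# assumes the objects defined above and returns the combined tables directly):
# append.and.save <- function(new.df, old.df, df.name, rdata.file) {
#   temp <- unique(if (nrow(old.df) == 0) new.df else rbind(old.df, new.df))
#   assign(df.name, temp)
#   save(list = df.name, file = rdata.file, envir = environment())
#   write.dta(temp, file.path(YOUR.OUTPUT.PATH, paste0(df.name, ".dta")))
#   temp
# }
# out_data <- Map(append.and.save, myclean, my.old.data, dataset.names, storage.locations)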