
Commit

Add files via upload
michalovadek authored Sep 10, 2020
1 parent 6f17d73 commit d6a4e3e
Showing 46 changed files with 733 additions and 34,920 deletions.
6 changes: 4 additions & 2 deletions DESCRIPTION
@@ -1,7 +1,7 @@
Package: eurlex
Type: Package
Title: Retrieve Data on European Union Law
Version: 0.2.3
Version: 0.3.0
Authors@R: c(person(given = "Michal",
family = "Ovadek",
role = c("aut", "cre", "cph"),
@@ -22,7 +22,9 @@ Imports:
rvest,
rlang,
stringr,
readr
readr,
pdftools,
antiword
RoxygenNote: 7.1.0
Suggests:
knitr,
26 changes: 26 additions & 0 deletions NEWS.md
@@ -0,0 +1,26 @@
# eurlex 0.3.0

## Major changes

- `elx_fetch_data(type = "text")` now retrieves plain text from HTML, PDF and MS Word documents (see the usage sketch below)
- the type of source file is recorded in the names of the returned text elements
- added handling of multiple files: all available text is retrieved and concatenated
- images requiring OCR are not yet supported, in order to limit dependencies and avoid prolonging execution time
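
A minimal usage sketch of the new behaviour, assuming a Cellar work URI obtained through `elx_run_query()` (the URI below is a hypothetical placeholder):

```r
library(eurlex)

# hypothetical Cellar work URI; in practice taken from elx_run_query() output
work <- "http://publications.europa.eu/resource/cellar/xxxx"

# retrieve plain text, falling back from English to French to German
txt <- elx_fetch_data(work, type = "text")

# the element name records the source format: "html", "pdf", "word" or "multidocs"
names(txt)
```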

# eurlex 0.2.3

## Minor changes

- fixed serious bugs in `elx_curia_list()`

# eurlex 0.2.2

## Major changes

- `elx_council_votes()` made fully operational

# eurlex 0.2.1

## Minor changes

- optimization: reduced dependencies, switched from XML to xml2, etc.
2 changes: 1 addition & 1 deletion R/elx_curia_list.R
@@ -10,7 +10,7 @@
#' @export
#' @examples
#' \donttest{
#' elx_curia_list(data = "gc_all")
#' elx_curia_list(data = "cst_all")
#' }

elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all")){
127 changes: 121 additions & 6 deletions R/elx_fetch_data.R
@@ -3,10 +3,11 @@
#' Wraps httr::GET with pre-specified headers to retrieve data.
#'
#' @param url A valid url, preferably to a Cellar work obtained through `elx_run_query`.
#' @param type The type of data to be retrieved
#' @param type The type of data to be retrieved. When type = "text", the returned list contains named elements reflecting the source of each text.
#' @param language_1 The priority language in which to attempt to retrieve the data, as an ISO 639 2-character code
#' @param language_2 If data not available in `language_1`, try `language_2`
#' @param language_3 If data not available in `language_2`, try `language_3`
#' @param include_breaks If TRUE, text includes tags showing where pages ("---pagebreak---", for pdfs) and documents ("---documentbreak---") were concatenated
#' @export
#' @examples
#' \donttest{
@@ -15,7 +16,8 @@
#' }
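
As an aside, a short sketch of how the three language arguments are combined into the `Accept-Language` header (mirroring the `paste()` call in the function body below; the q-values come straight from the source):

```r
# illustrative only: how language_1/2/3 become an Accept-Language header
language_1 <- "en"; language_2 <- "fr"; language_3 <- "de"
language <- paste(language_1, ", ", language_2, ";q=0.8, ", language_3, ";q=0.7", sep = "")
language
#> [1] "en, fr;q=0.8, de;q=0.7"
```

Server-side content negotiation then prefers the highest-ranked language that is actually available, which is what makes the `language_2` and `language_3` fallbacks work.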

elx_fetch_data <- function(url, type = c("title","text","ids"),
language_1 = "en", language_2 = "fr", language_3 = "de"){
language_1 = "en", language_2 = "fr", language_3 = "de",
include_breaks = TRUE){

language <- paste(language_1,", ",language_2,";q=0.8, ",language_3,";q=0.7", sep = "")

@@ -44,17 +46,62 @@ elx_fetch_data <- function(url, type = c("title","text","ids"),
response <- httr::GET(url=url,
httr::add_headers('Accept-Language' = language,
'Content-Language' = language,
'Accept' = 'text/html, application/xhtml+xml'
'Accept' = 'text/html, text/html;type=simplified, text/plain, application/xhtml+xml, application/xhtml+xml;type=simplified, application/pdf, application/pdf;type=pdf1x, application/pdf;type=pdfa1a, application/pdf;type=pdfx, application/pdf;type=pdfa1b, application/msword'
)
)

if (httr::status_code(response)==200){

out <- response %>%
out <- elx_read_text(response)

}

else if (httr::status_code(response)==300){

links <- response %>%
httr::content(as = "text") %>%
xml2::read_html() %>%
rvest::html_text()
rvest::html_node("body") %>%
rvest::html_nodes("a") %>%
rvest::html_attrs() %>%
unlist()

} else {out <- httr::status_code(response)}
names(links) <- NULL

multiout <- ""

for (q in 1:length(links)){

multiresponse <- httr::GET(url=links[q],
httr::add_headers('Accept-Language' = language,
'Content-Language' = language,
'Accept' = 'text/html, text/html;type=simplified, text/plain, application/xhtml+xml, application/xhtml+xml;type=simplified, application/pdf, application/pdf;type=pdf1x, application/pdf;type=pdfa1a, application/pdf;type=pdfx, application/pdf;type=pdfa1b, application/msword'
)
)

if (httr::status_code(multiresponse)==200){

multiout[q] <- elx_read_text(multiresponse)

} else {multiout[q] <- NA_character_}

}

multiout <- paste0(multiout, collapse = " ---documentbreak--- ")

names(multiout) <- "multidocs"

out <- multiout

}

else if (httr::status_code(response)==406){

out <- NA

names(out) <- "missingdoc"

}

}

@@ -93,11 +140,79 @@ elx_fetch_data <- function(url, type = c("title","text","ids"),

}

if (include_breaks == FALSE){

out <- out %>%
stringr::str_remove_all("---pagebreak---|---documentbreak---")

}

return(out)

}
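
When `include_breaks = TRUE` (the default), the tags stripped in the block above are kept in the output, so page and document boundaries can be recovered downstream. A minimal sketch of that step, where `txt` stands for a text element returned by `elx_fetch_data(..., type = "text")`:

```r
# split a concatenated multi-document text back into its parts
docs <- strsplit(txt, " ---documentbreak--- ", fixed = TRUE)[[1]]

# within a PDF-derived text, recover individual pages the same way
pages <- strsplit(docs[1], " ---pagebreak--- ", fixed = TRUE)[[1]]

# or drop the tags entirely, equivalent to calling with include_breaks = FALSE
clean <- stringr::str_remove_all(txt, "---pagebreak---|---documentbreak---")
```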


#' Read text from http response
#'
#' @importFrom rlang .data
#'
#' @noRd
#'

elx_read_text <- function(http_response){

if (stringr::str_detect(http_response$headers$`content-type`,"html")){

out <- http_response %>%
xml2::read_html() %>%
rvest::html_node("body") %>%
rvest::html_text() %>%
paste0(collapse = " ---pagebreak--- ")

names(out) <- "html"

}

else if (stringr::str_detect(http_response$headers$`content-type`,"pdf")){

out <- http_response$url %>%
pdftools::pdf_text() %>%
paste0(collapse = " ---pagebreak--- ")

names(out) <- "pdf"

}

else if (stringr::str_detect(http_response$headers$`content-type`,"msword")){

out <- http_response$url %>%
antiword::antiword() %>%
paste0(collapse = " ---pagebreak--- ")

names(out) <- "word"

} else {
out <- "unsupported format"
names(out) <- "unsupported"
}

return(out)

}

# testing
# dat <- eurlex::elx_make_query("proposal") %>% eurlex::elx_run_query()
#
# smpl <- sample_n(dat,100)
#
# new <- map(smpl$work,possibly(elx_fetch_data, otherwise = NA_character_),"text")
# smpl[which(map((new),function(x) sum(is.na(x)))==1),3] %>% print(n=100)
#
# unlist(new) %>% enframe() %>% mutate(celex = smpl$celex) %>% filter(name=="multidocs") %>% print(n=150)
#
# multi_url <- smpl[43,1] %>% deframe()
#
# for (i in 150:nrow(smpl)){
# elx_fetch_data(smpl$work[i],"text")
# }

2 changes: 1 addition & 1 deletion doc/eurlexpkg.Rmd
Original file line number Diff line number Diff line change
@@ -88,7 +88,7 @@ Now that we have a query, we are ready to run it.

## `elx_run_query()`: Execute SPARQL queries

`elx_run_query()` uses `RCurl` to send SPARQL queries to a pre-specified endpoint. The function takes the query string as the main argument, which means you can manually pass it any working SPARQL query (relevant to official EU publications).
`elx_run_query()` sends SPARQL queries to a pre-specified endpoint. The function takes the query string as the main argument, which means you can manually pass it any working SPARQL query (relevant to official EU publications).

```{r runquery, eval=FALSE}
results <- elx_run_query(query = query_dir)
10 changes: 5 additions & 5 deletions doc/eurlexpkg.html


10 changes: 9 additions & 1 deletion docs/404.html


Binary file added docs/apple-touch-icon-120x120.png
Binary file added docs/apple-touch-icon-152x152.png
Binary file added docs/apple-touch-icon-180x180.png
Binary file added docs/apple-touch-icon-60x60.png
Binary file added docs/apple-touch-icon-76x76.png
Binary file added docs/apple-touch-icon.png
15 changes: 11 additions & 4 deletions docs/articles/eurlexpkg.html


Binary file modified docs/articles/eurlexpkg_files/figure-html/firstplot-1.png
Binary file modified docs/articles/eurlexpkg_files/figure-html/unnamed-chunk-8-1.png
Binary file modified docs/articles/eurlexpkg_files/figure-html/wordcloud-1.png
10 changes: 9 additions & 1 deletion docs/articles/index.html


10 changes: 9 additions & 1 deletion docs/authors.html


Binary file added docs/favicon-16x16.png
Binary file added docs/favicon-32x32.png
Binary file added docs/favicon.ico
