From 225684dbcf01e8c8fee5f32e4e6b81d867cd2f99 Mon Sep 17 00:00:00 2001 From: michalovadek Date: Wed, 6 Sep 2023 18:00:09 +0100 Subject: [PATCH] 046 cran fix --- .Rbuildignore | 1 + DESCRIPTION | 2 +- NEWS.md | 4 + R/elx_council_votes.R | 25 +- R/elx_curia_list.R | 20 +- R/elx_fetch_data.R | 101 ++-- R/elx_run_query.R | 107 ++-- doc/eurlexpkg.html | 2 +- docs/404.html | 7 +- docs/articles/council.html | 22 +- docs/articles/eurlexpkg.html | 7 +- .../figure-html/wordcloud-1.png | Bin 96541 -> 90470 bytes docs/articles/index.html | 11 +- docs/articles/sparql-queries.html | 351 ++++++++++++ docs/authors.html | 7 +- docs/index.html | 7 +- docs/news/index.html | 10 +- docs/pkgdown.yml | 3 +- docs/reference/elx_council_votes.html | 7 +- docs/reference/elx_curia_list.html | 7 +- docs/reference/elx_download_xml.html | 7 +- docs/reference/elx_fetch_data.html | 11 +- docs/reference/elx_label_eurovoc.html | 7 +- docs/reference/elx_make_query.html | 7 +- docs/reference/elx_run_query.html | 7 +- docs/reference/index.html | 7 +- docs/sitemap.xml | 3 + man/elx_fetch_data.Rd | 2 +- tests/testthat/test-fetch.R | 2 +- tests/testthat/test-query.R | 2 +- vignettes/{ => articles}/council.Rmd | 11 +- vignettes/{ => articles}/eurlexpkg.Rmd | 534 +++++++++--------- vignettes/sparql-queries.Rmd | 101 ++++ 33 files changed, 983 insertions(+), 419 deletions(-) create mode 100644 docs/articles/sparql-queries.html rename vignettes/{ => articles}/council.Rmd (95%) rename vignettes/{ => articles}/eurlexpkg.Rmd (98%) create mode 100644 vignettes/sparql-queries.Rmd diff --git a/.Rbuildignore b/.Rbuildignore index 03a7327..d03a976 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -12,3 +12,4 @@ ^Meta$ ^CRAN-SUBMISSION$ ^cran-comments\.md$ +^vignettes/articles$ diff --git a/DESCRIPTION b/DESCRIPTION index 3a4dadc..03764e0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: eurlex Type: Package Title: Retrieve Data on European Union Law -Version: 0.4.5 +Version: 0.4.6 Authors@R: 
c(person(given = "Michal", family = "Ovadek", role = c("aut", "cre", "cph"), diff --git a/NEWS.md b/NEWS.md index eec75d0..466515a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,6 +3,10 @@ ## Minor changes - minor changes to documentation +- cleaned up http calls code +- calls to `elx_council_votes()` and `elx_curia_list()` now fail gracefully +- .data replaced by quoted variables for tidyselect functions +- Internet-using vignettes moved to site-only articles # eurlex 0.4.5 diff --git a/R/elx_council_votes.R b/R/elx_council_votes.R index 7fff72f..44d3784 100644 --- a/R/elx_council_votes.R +++ b/R/elx_council_votes.R @@ -89,13 +89,30 @@ elx_council_votes <- function(){ } ORDER BY DESC(?decisionDate), ?votingInstCode " - - votes <- httr::POST(url = "https://data.consilium.europa.eu/sparql", - body = list(query = query), - httr::add_headers('Accept' = 'text/csv')) %>% + + # run query + votes_resp <- graceful_http( + remote_file = "https://data.consilium.europa.eu/sparql", + body = list(query = query), + httr::content_type("multipart"), + headers = httr::add_headers('Accept' = 'text/csv'), + encode = "multipart", + verb = "POST" + ) + + # if var not created, break + if (is.null(votes_resp)){ + + return(invisible(NULL)) + + } + + # process response + votes <- votes_resp %>% httr::content("text") %>% readr::read_csv(col_types = readr::cols(.default = "c")) + # return return(votes) } diff --git a/R/elx_curia_list.R b/R/elx_curia_list.R index c53cc5d..4765282 100644 --- a/R/elx_curia_list.R +++ b/R/elx_curia_list.R @@ -101,7 +101,16 @@ elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all" elx_curia_scraper <- function(url, ...){ - page <- xml2::read_html(url(url, open = "rb")) + response <- graceful_http(url, verb = "GET") + + # if var not created, break + if (is.null(response)){ + + return(invisible(NULL)) + + } + + page <- xml2::read_html(response) tab <- page %>% rvest::html_node("table") %>% @@ -131,7 +140,7 @@ elx_curia_scraper <- function(url, 
...){ dplyr::ungroup() out <- dplyr::left_join(tab, linked, by = c("case_id"="linked_id","n_id"="n_id")) %>% - dplyr::select(.data$case_id, .data$linked_celex, .data$case_info) %>% + dplyr::select("case_id", "linked_celex", "case_info") %>% dplyr::rename(case_id_celex = linked_celex) return(out) @@ -152,16 +161,13 @@ elx_curia_parse <- function(x, ...){ see_case = stringr::str_extract(.data$case_info, "see Case .+") %>% stringr::str_remove("see Case ") %>% stringr::str_remove("APPEAL.*") %>% - stringr::str_squish() %>% - stringr::str_trim(), + stringr::str_squish(), appeal = stringr::str_extract(.data$case_info, "APPEAL.*") %>% stringr::str_remove("APPEAL.? :") %>% stringr::str_remove_all("\\;|\\,|\\.") %>% - stringr::str_squish() %>% - stringr::str_trim() + stringr::str_squish() ) return(out) } - diff --git a/R/elx_fetch_data.R b/R/elx_fetch_data.R index c3c9fb4..ba2403c 100644 --- a/R/elx_fetch_data.R +++ b/R/elx_fetch_data.R @@ -1,6 +1,6 @@ #' Retrieve additional data on EU documents #' -#' Wraps httr::GET with pre-specified headers and parses retrieved data. +#' Get titles, texts, identifiers and XML notices for EU resources. #' #' @param url A valid url as character vector of length one based on a resource identifier such as CELEX or Cellar URI. #' @param type The type of data to be retrieved. When type = "text", the returned list contains named elements reflecting the source of each text. When type = "notice", the results return an XML notice associated with the url. 
@@ -38,8 +38,10 @@ elx_fetch_data <- function(url, type = c("title","text","ids","notice"), if (type == "notice" & missing(notice)){stop("notice type must be given")} + # format language query language <- paste(language_1,", ",language_2,";q=0.8, ",language_3,";q=0.7", sep = "") + # process URL if (stringr::str_detect(url,"celex.*[\\(|\\)|\\/]")){ clx <- stringr::str_extract(url, "(?<=celex\\/).*") %>% @@ -53,12 +55,20 @@ elx_fetch_data <- function(url, type = c("title","text","ids","notice"), } + # titles if (type == "title"){ response <- graceful_http(url, headers = httr::add_headers('Accept-Language' = language, 'Accept' = 'application/xml; notice=object'), verb = "GET") + + # if var not created, break + if (is.null(response)){ + + return(invisible(NULL)) + + } if (httr::status_code(response)==200){ @@ -71,6 +81,7 @@ elx_fetch_data <- function(url, type = c("title","text","ids","notice"), } + # full text if (type == "text"){ response <- graceful_http(url, @@ -79,6 +90,13 @@ elx_fetch_data <- function(url, type = c("title","text","ids","notice"), 'Accept' = 'text/html, text/html;type=simplified, text/plain, application/xhtml+xml, application/xhtml+xml;type=simplified, application/pdf, application/pdf;type=pdf1x, application/pdf;type=pdfa1a, application/pdf;type=pdfx, application/pdf;type=pdfa1b, application/msword'), verb = "GET") + # if var not created, break + if (is.null(response)){ + + return(invisible(NULL)) + + } + if (httr::status_code(response)==200){ out <- elx_read_text(response, html_text = html_text) @@ -142,12 +160,20 @@ elx_fetch_data <- function(url, type = c("title","text","ids","notice"), } + # identifiers if (type == "ids"){ response <- graceful_http(url, headers = httr::add_headers('Accept-Language' = language, 'Accept' = 'application/xml; notice=identifiers'), verb = "GET") + + # if var not created, break + if (is.null(response)){ + + return(invisible(NULL)) + + } if (httr::status_code(response)==200){ @@ -160,6 +186,7 @@ elx_fetch_data <- 
function(url, type = c("title","text","ids","notice"), } + # notices if (type == "notice"){ accept_header <- paste('application/xml; notice=', @@ -184,6 +211,13 @@ elx_fetch_data <- function(url, type = c("title","text","ids","notice"), } + # if var not created, break + if (is.null(response)){ + + return(invisible(NULL)) + + } + if (httr::status_code(response)==200){ out <- httr::content(response) @@ -192,6 +226,7 @@ elx_fetch_data <- function(url, type = c("title","text","ids","notice"), } + # end return(out) } @@ -225,40 +260,40 @@ elx_read_text <- function(http_response, html_text = "text2"){ } - if (stringr::str_detect(http_response$headers$`content-type`,"html")){ - - out <- http_response %>% - xml2::read_html() %>% - rvest::html_node("body") %>% - html_text_engine() %>% - paste0(collapse = " ---pagebreak--- ") - - names(out) <- "html" - - } - - else if (stringr::str_detect(http_response$headers$`content-type`,"pdf")){ - - out <- http_response$url %>% - pdftools::pdf_text() %>% - paste0(collapse = " ---pagebreak--- ") - - names(out) <- "pdf" - - } - - else if (stringr::str_detect(http_response$headers$`content-type`,"msword")){ - - out <- http_response$url %>% - antiword::antiword() %>% - paste0(collapse = " ---pagebreak--- ") + if (stringr::str_detect(http_response$headers$`content-type`,"html")){ + + out <- http_response %>% + xml2::read_html() %>% + rvest::html_node("body") %>% + html_text_engine() %>% + paste0(collapse = " ---pagebreak--- ") + + names(out) <- "html" + + } - names(out) <- "word" + else if (stringr::str_detect(http_response$headers$`content-type`,"pdf")){ + + out <- http_response$url %>% + pdftools::pdf_text() %>% + paste0(collapse = " ---pagebreak--- ") + + names(out) <- "pdf" + + } - } else { - out <- "unsupported format" - names(out) <- "unsupported" - } + else if (stringr::str_detect(http_response$headers$`content-type`,"msword")){ + + out <- http_response$url %>% + antiword::antiword() %>% + paste0(collapse = " ---pagebreak--- ") + + 
names(out) <- "word" + + } else { + out <- "unsupported format" + names(out) <- "unsupported" + } return(out) diff --git a/R/elx_run_query.R b/R/elx_run_query.R index ee4144a..40fbf65 100644 --- a/R/elx_run_query.R +++ b/R/elx_run_query.R @@ -27,10 +27,15 @@ elx_run_query <- function(query = "", endpoint = "http://publications.europa.eu/ sparql_response <- graceful_http(curlready, headers = httr::add_headers('Accept' = 'application/sparql-results+xml'), verb = "GET") - - # parse response - sparql_response_parsed <- sparql_response %>% - elx_parse_xml() + + # if var created, continue + if (!is.null(sparql_response)){ + + # parse response + sparql_response_parsed <- sparql_response %>% + elx_parse_xml() + + } else {return(invisible(NULL))} # return return(sparql_response_parsed) @@ -44,68 +49,48 @@ elx_run_query <- function(query = "", endpoint = "http://publications.europa.eu/ #' @noRd #' -graceful_http <- function(remote_file, headers, verb = c("GET","HEAD")) { - - try_GET <- function(x, ...) { - tryCatch( - httr::GET(url = x, - #httr::timeout(1000000000), - headers), - error = function(e) conditionMessage(e), - warning = function(w) conditionMessage(w) - ) - } +graceful_http <- function(remote_file, headers = NULL, body = NULL, + verb = c("GET", "HEAD", "POST"), timeout = 100000, + content_type = NULL, encode = NULL) { - try_HEAD <- function(x, ...) 
{ - tryCatch( - httr::HEAD(url = x, - #httr::timeout(1000000000), - headers), - error = function(e) conditionMessage(e), - warning = function(w) conditionMessage(w) - ) - } - - is_response <- function(x) { - class(x) == "response" - } - - # First check internet connection + # Check internet connection if (!curl::has_internet()) { message("No internet connection.") return(invisible(NULL)) } - if (verb == "GET"){ - - # Then try for timeout problems - resp <- try_GET(remote_file) - if (!is_response(resp)) { - message(resp) + # Make the HTTP request based on the verb + make_request <- function(verb) { + tryCatch({ + if (verb == "GET") { + httr::GET(url = remote_file, config = httr::timeout(timeout), headers) + } else if (verb == "HEAD") { + httr::HEAD(url = remote_file, config = httr::timeout(timeout), headers) + } else if (verb == "POST") { + httr::POST(url = remote_file, body = body, headers, + content_type = content_type, encode = encode) + } + }, + error = function(e) { + message("Error: ", conditionMessage(e)) return(invisible(NULL)) - } - - } - - else if (verb == "HEAD"){ - - # Then try for timeout problems - resp <- try_HEAD(remote_file) - if (!is_response(resp)) { - message(resp) + }, + warning = function(w) { + message("Warning: ", conditionMessage(w)) return(invisible(NULL)) - } - + }) } - - # Then stop if status > 400 + + # Execute the request + resp <- make_request(verb) + + # Check for HTTP errors if (httr::http_error(resp)) { httr::message_for_status(resp) return(invisible(NULL)) } - + return(resp) - } #' Parse RDF/XML triplets to data frame @@ -117,6 +102,7 @@ graceful_http <- function(remote_file, headers, verb = c("GET","HEAD")) { elx_parse_xml <- function(sparql_response = "", strip_uri = TRUE){ + # process XML response res_binding <- sparql_response %>% xml2::read_xml() %>% xml2::xml_find_all("//d1:binding") @@ -125,7 +111,8 @@ elx_parse_xml <- function(sparql_response = "", strip_uri = TRUE){ res_cols <- xml2::xml_attr(res_binding, "name") - if 
(identical(unique(res_cols), c("eurovoc","labels"))){ # for use in elx_label_eurovoc + # eurovoc labels + if (identical(unique(res_cols), c("eurovoc","labels"))){ out <- data.frame(res_cols, res_text) %>% dplyr::mutate(is_work = dplyr::if_else(res_cols=="eurovoc", T, NA)) %>% @@ -134,11 +121,14 @@ elx_parse_xml <- function(sparql_response = "", strip_uri = TRUE){ triplet = dplyr::if_else(.data$is_work==T, .data$triplet, NA_integer_)) %>% dplyr::ungroup() %>% tidyr::fill(.data$triplet) %>% - dplyr::select(-.data$is_work) %>% + dplyr::select(-"is_work") %>% tidyr::pivot_wider(names_from = res_cols, values_from = res_text) %>% - dplyr::select(-.data$triplet) + dplyr::select(-"triplet") - } else { + } + + # regular result + else { out <- data.frame(res_cols, res_text) %>% dplyr::mutate(is_work = dplyr::if_else(res_cols=="work", T, NA)) %>% @@ -147,9 +137,9 @@ elx_parse_xml <- function(sparql_response = "", strip_uri = TRUE){ triplet = dplyr::if_else(.data$is_work==T, .data$triplet, NA_integer_)) %>% dplyr::ungroup() %>% tidyr::fill(.data$triplet) %>% - dplyr::select(-.data$is_work) %>% + dplyr::select(-"is_work") %>% tidyr::pivot_wider(names_from = res_cols, values_from = res_text) %>% - dplyr::select(-.data$triplet) + dplyr::select(-"triplet") } @@ -164,6 +154,7 @@ elx_parse_xml <- function(sparql_response = "", strip_uri = TRUE){ } + # end return(out) } diff --git a/doc/eurlexpkg.html b/doc/eurlexpkg.html index c804959..6d7603c 100644 --- a/doc/eurlexpkg.html +++ b/doc/eurlexpkg.html @@ -784,7 +784,7 @@

Application

filter(!grepl("\\d", word)) %>% bind_tf_idf(word, celex, n) %>% with(wordcloud(word, tf_idf, max.words = 40)) -

+

I use term-frequency inverse-document frequency (tf-idf) to weight the importance of the words in the wordcloud. If we used pure frequencies, the wordcloud would largely consist of words conveying diff --git a/docs/404.html b/docs/404.html index b0c0e68..89e0462 100644 --- a/docs/404.html +++ b/docs/404.html @@ -39,7 +39,7 @@ eurlex - 0.4.5 + 0.4.6 @@ -56,11 +56,14 @@

  • diff --git a/docs/articles/council.html b/docs/articles/council.html index 57806cd..dce79f6 100644 --- a/docs/articles/council.html +++ b/docs/articles/council.html @@ -5,7 +5,7 @@ -Data on votes in the Council of the EU • eurlex +Voting in the Council of the EU • eurlex @@ -17,7 +17,7 @@ - + + + + + + +Make SPARQL queries with eurlex • eurlex + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    +
    + + + + + +

    This vignette shows how to use the eurlex R package to +make SPARQL queries to retrieve data on European Union law.

    +
    +

    Introduction +

    +

    Dozens of political scientists and legal scholars use data on +European Union laws in their research. The provenance of these data is +rarely discussed. More often than not, researchers resort to the quick +and dirty technique of scraping entire html pages from +eur-lex.europa.eu. This is not the optimal, nor preferred +(from the perspective of the server host) approach of retrieving data, +however, especially as the Publication Office of the European Union, the +public body behind Eur-Lex, operates several dedicated APIs for +automated retrieval of its data.

    +

    The allure of web scraping is completely understandable. Not only is +it easier to download data that can be readily seen in a user-friendly +manner through a browser, using the dedicated APIs requires technical +knowledge of semantic web and Client URL technologies, which is not +necessarily widespread among researchers. And why go through the pain of +learning how to compile SPARQL queries when it is much easier to simply +download the web page?

    +

    The eurlex R package attempts to significantly reduce +the overhead associated with using the SPARQL and REST APIs made +available by the EU Publication Office. Although at present it does not +offer access to the same array of information as comprehensive web +scraping might, the package provides simpler, more efficient and +transparent access to data on European Union law. This vignette gives a +quick guide to the package and an even quicker introduction to the +Eur-Lex dataverse.

    +
    +
    +

    The eurlex package +

    +

    The eurlex package currently envisions the typical +use-case to consist of getting bulk information about EU law and policy +into R as fast as possible. The package contains three core functions to +achieve that objective: elx_make_query() to create SPARQL +queries based on user input; elx_run_query() to execute the +pre-made or any other manually input query; and +elx_fetch_data() to fire GET requests for certain metadata +to the REST API.

    +

    The package also contains largely self-explanatory functions for +retrieving data on EU court cases (elx_curia_list()) and +Council votes (elx_council_votes()) from outside Eur-Lex. +More advanced users might be interested in downloading and +custom-parsing XML notices with elx_download_xml().

    +
    +

    +elx_make_query(): Generate SPARQL queries +

    +

    The function elx_make_query takes as its first argument +the type of resource to be retrieved from the semantic database that +powers Eur-Lex (and other publications) called Cellar.

    +
    +library(eurlex)
    +library(dplyr)
    +
    +query_dir <- elx_make_query(resource_type = "directive")
    +

    Currently, it is possible to choose from among a host of resource +types, including directives, regulations and even case law (see function +description for the full list). It is also possible to manually specify +a resource type from the eligible +list.1

    +

    The choice of resource type is then reflected in the SPARQL query +generated by the function:

    +
    +query_dir %>% 
    +  cat()
    +#> PREFIX cdm: <http://publications.europa.eu/ontology/cdm#>
    +#>   PREFIX annot: <http://publications.europa.eu/ontology/annotation#>
    +#>   PREFIX skos:<http://www.w3.org/2004/02/skos/core#>
    +#>   PREFIX dc:<http://purl.org/dc/elements/1.1/>
    +#>   PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
    +#>   PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    +#>   PREFIX owl:<http://www.w3.org/2002/07/owl#>
    +#>   select distinct ?work ?type ?celex where{ ?work cdm:work_has_resource-type ?type. FILTER(?type=<http://publications.europa.eu/resource/authority/resource-type/DIR>||
    +#>   ?type=<http://publications.europa.eu/resource/authority/resource-type/DIR_IMPL>||
    +#>   ?type=<http://publications.europa.eu/resource/authority/resource-type/DIR_DEL>) 
    +#>  FILTER not exists{?work cdm:work_has_resource-type <http://publications.europa.eu/resource/authority/resource-type/CORRIGENDUM>} OPTIONAL{?work cdm:resource_legal_id_celex ?celex.} FILTER not exists{?work cdm:do_not_index "true"^^<http://www.w3.org/2001/XMLSchema#boolean>}. }
    +
    +elx_make_query(resource_type = "caselaw") %>% 
    +  cat()
    +#> PREFIX cdm: <http://publications.europa.eu/ontology/cdm#>
    +#>   PREFIX annot: <http://publications.europa.eu/ontology/annotation#>
    +#>   PREFIX skos:<http://www.w3.org/2004/02/skos/core#>
    +#>   PREFIX dc:<http://purl.org/dc/elements/1.1/>
    +#>   PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
    +#>   PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    +#>   PREFIX owl:<http://www.w3.org/2002/07/owl#>
    +#>   select distinct ?work ?type ?celex where{ ?work cdm:work_has_resource-type ?type. FILTER(?type=<http://publications.europa.eu/resource/authority/resource-type/JUDG>||
    +#>   ?type=<http://publications.europa.eu/resource/authority/resource-type/ORDER>||
    +#>   ?type=<http://publications.europa.eu/resource/authority/resource-type/OPIN_JUR>||
    +#>   ?type=<http://publications.europa.eu/resource/authority/resource-type/THIRDPARTY_PROCEED>||
    +#>   ?type=<http://publications.europa.eu/resource/authority/resource-type/GARNISHEE_ORDER>||
    +#>   ?type=<http://publications.europa.eu/resource/authority/resource-type/RULING>||
    +#>   ?type=<http://publications.europa.eu/resource/authority/resource-type/JUDG_EXTRACT>||
    +#>   ?type=<http://publications.europa.eu/resource/authority/resource-type/INFO_JUDICIAL>||
    +#>   ?type=<http://publications.europa.eu/resource/authority/resource-type/VIEW_AG>||
    +#>   ?type=<http://publications.europa.eu/resource/authority/resource-type/OPIN_AG>) 
    +#>  FILTER not exists{?work cdm:work_has_resource-type <http://publications.europa.eu/resource/authority/resource-type/CORRIGENDUM>} OPTIONAL{?work cdm:resource_legal_id_celex ?celex.} FILTER not exists{?work cdm:do_not_index "true"^^<http://www.w3.org/2001/XMLSchema#boolean>}. }
    +
    +elx_make_query(resource_type = "manual", manual_type = "SWD") %>% 
    +  cat()
    +#> PREFIX cdm: <http://publications.europa.eu/ontology/cdm#>
    +#>   PREFIX annot: <http://publications.europa.eu/ontology/annotation#>
    +#>   PREFIX skos:<http://www.w3.org/2004/02/skos/core#>
    +#>   PREFIX dc:<http://purl.org/dc/elements/1.1/>
    +#>   PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
    +#>   PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    +#>   PREFIX owl:<http://www.w3.org/2002/07/owl#>
    +#>   select distinct ?work ?type ?celex where{ ?work cdm:work_has_resource-type ?type.FILTER(?type=<http://publications.europa.eu/resource/authority/resource-type/SWD>) 
    +#>  FILTER not exists{?work cdm:work_has_resource-type <http://publications.europa.eu/resource/authority/resource-type/CORRIGENDUM>} OPTIONAL{?work cdm:resource_legal_id_celex ?celex.} FILTER not exists{?work cdm:do_not_index "true"^^<http://www.w3.org/2001/XMLSchema#boolean>}. }
    +

    There are various ways of querying the same information in the Cellar +database due to the existence of several overlapping classes and +identifiers describing the same resources. The queries generated by the +function should offer a reliable way of obtaining exhaustive results, as +they have been validated by the helpdesk of the Publication Office. At +the same time, it is always possible there will be issues either on the +query or the database side; please report any you encounter through +Github.

    +

    The other arguments in elx_make_query() relate to +additional metadata to be returned. The results include by default the +CELEX +number and exclude corrigenda (corrections of errors in +legislation). Other data needs to be opted into. Make sure to select +ones that are logically compatible (e.g. case law does not have a legal +basis). More options should be added in the future.

    +

    Note that availability of data for each variable might have an impact +on the results. The data frame returned by the query might be shrunken +to the size of the variable with most missing data. It is recommended to +always compare results from a desired query to a minimal query +requesting only celex ids.

    +
    +elx_make_query(resource_type = "directive", include_date = TRUE, include_force = TRUE) %>% 
    +  cat()
    +#> PREFIX cdm: <http://publications.europa.eu/ontology/cdm#>
    +#>   PREFIX annot: <http://publications.europa.eu/ontology/annotation#>
    +#>   PREFIX skos:<http://www.w3.org/2004/02/skos/core#>
    +#>   PREFIX dc:<http://purl.org/dc/elements/1.1/>
    +#>   PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
    +#>   PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    +#>   PREFIX owl:<http://www.w3.org/2002/07/owl#>
    +#>   select distinct ?work ?type ?celex ?date ?force where{ ?work cdm:work_has_resource-type ?type. FILTER(?type=<http://publications.europa.eu/resource/authority/resource-type/DIR>||
    +#>   ?type=<http://publications.europa.eu/resource/authority/resource-type/DIR_IMPL>||
    +#>   ?type=<http://publications.europa.eu/resource/authority/resource-type/DIR_DEL>) 
    +#>  FILTER not exists{?work cdm:work_has_resource-type <http://publications.europa.eu/resource/authority/resource-type/CORRIGENDUM>} OPTIONAL{?work cdm:resource_legal_id_celex ?celex.} OPTIONAL{?work cdm:work_date_document ?date.} OPTIONAL{?work cdm:resource_legal_in-force ?force.} FILTER not exists{?work cdm:do_not_index "true"^^<http://www.w3.org/2001/XMLSchema#boolean>}. }
    +
    +# minimal query: elx_make_query(resource_type = "directive")
    +
    +elx_make_query(resource_type = "recommendation", include_date = TRUE, include_lbs = TRUE) %>% 
    +  cat()
    +#> PREFIX cdm: <http://publications.europa.eu/ontology/cdm#>
    +#>   PREFIX annot: <http://publications.europa.eu/ontology/annotation#>
    +#>   PREFIX skos:<http://www.w3.org/2004/02/skos/core#>
    +#>   PREFIX dc:<http://purl.org/dc/elements/1.1/>
    +#>   PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
    +#>   PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    +#>   PREFIX owl:<http://www.w3.org/2002/07/owl#>
    +#>   select distinct ?work ?type ?celex ?date ?lbs ?lbcelex ?lbsuffix where{ ?work cdm:work_has_resource-type ?type. FILTER(?type=<http://publications.europa.eu/resource/authority/resource-type/RECO>||
    +#>                    ?type=<http://publications.europa.eu/resource/authority/resource-type/RECO_DEC>||
    +#>                    ?type=<http://publications.europa.eu/resource/authority/resource-type/RECO_DIR>||
    +#>                    ?type=<http://publications.europa.eu/resource/authority/resource-type/RECO_OPIN>||
    +#>                    ?type=<http://publications.europa.eu/resource/authority/resource-type/RECO_RES>||
    +#>                    ?type=<http://publications.europa.eu/resource/authority/resource-type/RECO_REG>||
    +#>                    ?type=<http://publications.europa.eu/resource/authority/resource-type/RECO_RECO>||
    +#>                    ?type=<http://publications.europa.eu/resource/authority/resource-type/RECO_DRAFT>) 
    +#>  FILTER not exists{?work cdm:work_has_resource-type <http://publications.europa.eu/resource/authority/resource-type/CORRIGENDUM>} OPTIONAL{?work cdm:resource_legal_id_celex ?celex.} OPTIONAL{?work cdm:work_date_document ?date.} OPTIONAL{?work cdm:resource_legal_based_on_resource_legal ?lbs.
    +#>     ?lbs cdm:resource_legal_id_celex ?lbcelex.
    +#>     OPTIONAL{?bn owl:annotatedSource ?work.
    +#>     ?bn owl:annotatedProperty <http://publications.europa.eu/ontology/cdm#resource_legal_based_on_resource_legal>.
    +#>     ?bn owl:annotatedTarget ?lbs.
    +#>     ?bn annot:comment_on_legal_basis ?lbsuffix}} FILTER not exists{?work cdm:do_not_index "true"^^<http://www.w3.org/2001/XMLSchema#boolean>}. }
    +
    +# minimal query: elx_make_query(resource_type = "recommendation")
    +

    You can also decide to not specify any resource types, in which case +all types of documents will be returned. As there are over a million +documents with a CELEX identifier, this is likely not efficient for a +majority of users. But since version 0.3.5 it is possible to request +documents belonging to a particular “sector” +or directory +code.

    +
    +# request documents from directory 18 ("Common Foreign and Security Policy")
    +# and sector 3 ("Legal acts")
    +
    +elx_make_query(resource_type = "any",
    +               directory = "18",
    +               sector = 3) %>% 
    +  cat()
    +#> PREFIX cdm: <http://publications.europa.eu/ontology/cdm#>
    +#>   PREFIX annot: <http://publications.europa.eu/ontology/annotation#>
    +#>   PREFIX skos:<http://www.w3.org/2004/02/skos/core#>
    +#>   PREFIX dc:<http://purl.org/dc/elements/1.1/>
    +#>   PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
    +#>   PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    +#>   PREFIX owl:<http://www.w3.org/2002/07/owl#>
    +#>   select distinct ?work ?type ?celex where{
    +#>     VALUES (?value)
    +#>     { (<http://publications.europa.eu/resource/authority/fd_555/18>)
    +#>       (<http://publications.europa.eu/resource/authority/dir-eu-legal-act/18>)
    +#>     }
    +#>     {?work cdm:resource_legal_is_about_concept_directory-code ?value.
    +#>     }
    +#>     UNION
    +#>     {?work cdm:resource_legal_is_about_concept_directory-code ?directory.
    +#>       ?value skos:narrower+ ?directory.
    +#>     }
    +#>     
    +#>     ?work cdm:resource_legal_id_sector ?sector.
    +#>     FILTER(str(?sector)='3')
    +#>      
    +#>  FILTER not exists{?work cdm:work_has_resource-type <http://publications.europa.eu/resource/authority/resource-type/CORRIGENDUM>} OPTIONAL{?work cdm:resource_legal_id_celex ?celex.} FILTER not exists{?work cdm:do_not_index "true"^^<http://www.w3.org/2001/XMLSchema#boolean>}. }
    +
    +
    +
    +
    +
      +
    1. Note, however, that not all resource types will work +properly with the pre-specified query.↩︎

    2. +
    +
    +
    + + + +
    + + + +
    + +
    +

    +

    Site built with pkgdown 2.0.7.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/authors.html b/docs/authors.html index 2239af5..04ac7d6 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -17,7 +17,7 @@ eurlex - 0.4.5 + 0.4.6 @@ -32,11 +32,14 @@
  • Changelog diff --git a/docs/index.html b/docs/index.html index 26886f2..f67a5b2 100644 --- a/docs/index.html +++ b/docs/index.html @@ -40,7 +40,7 @@ eurlex - 0.4.5 + 0.4.6 @@ -57,11 +57,14 @@
  • diff --git a/docs/news/index.html b/docs/news/index.html index 49ba53b..d66adf1 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -17,7 +17,7 @@ eurlex - 0.4.5 + 0.4.6 @@ -32,11 +32,14 @@
  • Changelog @@ -59,6 +62,9 @@
  • Changelog diff --git a/docs/reference/elx_curia_list.html b/docs/reference/elx_curia_list.html index 031932d..ffa0843 100644 --- a/docs/reference/elx_curia_list.html +++ b/docs/reference/elx_curia_list.html @@ -18,7 +18,7 @@ eurlex - 0.4.5 + 0.4.6 @@ -33,11 +33,14 @@
  • Changelog diff --git a/docs/reference/elx_download_xml.html b/docs/reference/elx_download_xml.html index 7684f6d..7779e82 100644 --- a/docs/reference/elx_download_xml.html +++ b/docs/reference/elx_download_xml.html @@ -17,7 +17,7 @@ eurlex - 0.4.5 + 0.4.6 @@ -32,11 +32,14 @@
  • Changelog diff --git a/docs/reference/elx_fetch_data.html b/docs/reference/elx_fetch_data.html index 54f151f..c7314b6 100644 --- a/docs/reference/elx_fetch_data.html +++ b/docs/reference/elx_fetch_data.html @@ -1,5 +1,5 @@ -Retrieve additional data on EU documents — elx_fetch_data • eurlexRetrieve additional data on EU documents — elx_fetch_data • eurlex @@ -17,7 +17,7 @@ eurlex - 0.4.5 + 0.4.6 @@ -32,11 +32,14 @@
  • Changelog @@ -56,7 +59,7 @@

    Retrieve additional data on EU documents

    -

    Wraps httr::GET with pre-specified headers and parses retrieved data.

    +

    Get titles, texts, identifiers and XML notices for EU resources.

    diff --git a/docs/reference/elx_label_eurovoc.html b/docs/reference/elx_label_eurovoc.html index c95ab58..ec8a3a1 100644 --- a/docs/reference/elx_label_eurovoc.html +++ b/docs/reference/elx_label_eurovoc.html @@ -17,7 +17,7 @@ eurlex - 0.4.5 + 0.4.6
    @@ -32,11 +32,14 @@
  • Changelog diff --git a/docs/reference/elx_make_query.html b/docs/reference/elx_make_query.html index f3e61ac..fb3288b 100644 --- a/docs/reference/elx_make_query.html +++ b/docs/reference/elx_make_query.html @@ -19,7 +19,7 @@ eurlex - 0.4.5 + 0.4.6 @@ -34,11 +34,14 @@
  • Changelog diff --git a/docs/reference/elx_run_query.html b/docs/reference/elx_run_query.html index 76fd7d8..fa68caa 100644 --- a/docs/reference/elx_run_query.html +++ b/docs/reference/elx_run_query.html @@ -19,7 +19,7 @@ eurlex - 0.4.5 + 0.4.6 @@ -34,11 +34,14 @@
  • Changelog diff --git a/docs/reference/index.html b/docs/reference/index.html index cf417a6..bea4c72 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -17,7 +17,7 @@ eurlex - 0.4.5 + 0.4.6 @@ -32,11 +32,14 @@
  • Changelog diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 7b649e7..0c82d82 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -12,6 +12,9 @@ /articles/index.html + + /articles/sparql-queries.html + /authors.html diff --git a/man/elx_fetch_data.Rd b/man/elx_fetch_data.Rd index 3a77090..ae941ed 100644 --- a/man/elx_fetch_data.Rd +++ b/man/elx_fetch_data.Rd @@ -36,7 +36,7 @@ elx_fetch_data( A character vector of length one containing the result. When \code{type = "text"}, named character vector where the name contains the source of the text. } \description{ -Wraps httr::GET with pre-specified headers and parses retrieved data. +Get titles, texts, identifiers and XML notices for EU resources. } \examples{ \donttest{ diff --git a/tests/testthat/test-fetch.R b/tests/testthat/test-fetch.R index c4f1c3b..5b8a139 100644 --- a/tests/testthat/test-fetch.R +++ b/tests/testthat/test-fetch.R @@ -1,4 +1,4 @@ -testthat::test_that("fetching data works", { +testthat::test_that("fetching notices works", { testthat::skip_on_cran() diff --git a/tests/testthat/test-query.R b/tests/testthat/test-query.R index 64631e1..6bf5737 100644 --- a/tests/testthat/test-query.R +++ b/tests/testthat/test-query.R @@ -1,4 +1,4 @@ -testthat::test_that("directives work", { +testthat::test_that("queries can be made", { testthat::skip_on_cran() diff --git a/vignettes/council.Rmd b/vignettes/articles/council.Rmd similarity index 95% rename from vignettes/council.Rmd rename to vignettes/articles/council.Rmd index 351d680..bcc5259 100644 --- a/vignettes/council.Rmd +++ b/vignettes/articles/council.Rmd @@ -1,13 +1,13 @@ --- -title: "Data on votes in the Council of the EU" +title: "Voting in the Council of the EU" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Data on votes in the Council of the EU} + %\VignetteIndexEntry{Voting in the Council of the EU} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- -```{r, include = FALSE} +```{r, echo = FALSE, message = 
FALSE, warning=FALSE, error=FALSE, include=FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" @@ -18,7 +18,9 @@ Few would disagree that the Council of the European Union (EU) -- sometimes also Under the OLP, which is nowadays the most common type of law-making procedure, the Council should make decisions by qualified majority. In practice, it often decides by consensus, as Member States tend to avoid open disagreements. Still, enough votes are taken to give us some insight into the variation in Member State governments' behaviour. We access these through a dedicated API maintained by the Council, which is also wrapped in the `eurlex` package. -## Council votes +## Data on Council votes + +First we obtain the available data on Council votes using `eurlex::elx_council_votes()` and process the API response. ```{r votingdata} # packages @@ -60,7 +62,6 @@ country_votes_prop <- country_votes_n %>% n_votes = sum(n), prop = round(value / n_votes, 3)) %>% ungroup() - ``` Excluding votes where all governments voted in favour, we are left with between ```r max(country_votes_prop$n_votes, na.rm = T)``` and ```r min(country_votes_prop$n_votes, na.rm = T)``` votes per Member State. While these numbers do not represent the entire historical voting record, they should still help us lift the veil on variation in Member States' propensity to disagree. Note that due to opt-outs not all countries have participated in every vote. diff --git a/vignettes/eurlexpkg.Rmd b/vignettes/articles/eurlexpkg.Rmd similarity index 98% rename from vignettes/eurlexpkg.Rmd rename to vignettes/articles/eurlexpkg.Rmd index bf00a2f..75e5f22 100644 --- a/vignettes/eurlexpkg.Rmd +++ b/vignettes/articles/eurlexpkg.Rmd @@ -1,267 +1,267 @@ ---- -title: "eurlex: Retrieve data on European Union law in R" -output: rmarkdown::html_vignette -description: > - Retrieve data on European Union law in R with - pre-defined SPARQL and REST queries. 
-vignette: > - %\VignetteIndexEntry{eurlex: Retrieve data on European Union law in R} - %\VignetteEngine{knitr::rmarkdown} - \usepackage[utf8]{inputenc} ---- - -```{r, echo = FALSE, message = FALSE, warning=FALSE, error=FALSE} -knitr::opts_chunk$set(collapse = T, comment = "#>") -options(tibble.print_min = 4, tibble.print_max = 4) -``` - -This vignette shows how to use the `eurlex` R package to retrieve data on European Union law. - -# Introduction - -Dozens of political scientists and legal scholars use data on European Union laws in their research. The provenance of these data is rarely discussed. More often than not, researchers resort to the quick and dirty technique of scraping entire html pages from `eur-lex.europa.eu`. This is not the optimal, nor preferred (from the perspective of the server host) approach of retrieving data, however, especially as the Publication Office of the European Union, the public body behind Eur-Lex, operates several dedicated APIs for automated retrieval of its data. - -The allure of web scraping is completely understandable. Not only is it easier to download data that can be readily seen in a user-friendly manner through a browser, using the dedicated APIs requires technical knowledge of semantic web and Client URL technologies, which is not necessarily widespread among researchers. And why go through the pain of learning how to compile SPARQL queries when it is much easier to simply download the web page? - -The `eurlex` R package attempts to significantly reduce the overhead associated with using the SPARQL and REST APIs made available by the EU Publication Office. Although at present it does not offer access to the same array of information as comprehensive web scraping might, the package provides simpler, more efficient and transparent access to data on European Union law. This vignette gives a quick guide to the package and an even quicker introduction to the Eur-Lex dataverse. 
- -# The `eurlex` package - -The `eurlex` package currently envisions the typical use-case to consist of getting bulk information about EU law and policy into R as fast as possible. The package contains three core functions to achieve that objective: `elx_make_query()` to create SPARQL queries based on user input; `elx_run_query()` to execute the pre-made or any other manually input query; and `elx_fetch_data()` to fire GET requests for certain metadata to the REST API. - -The package also contains largely self-explanatory functions for retrieving data on EU court cases (`elx_curia_list()`) and Council votes (`elx_council_votes()`) from outside Eur-Lex. More advanced users might be interested in downloading and custom-parsing XML notices with `elx_download_xml()`. - -## `elx_make_query()`: Generate SPARQL queries - -The function `elx_make_query` takes as its first argument the type of resource to be retrieved from the semantic database that powers Eur-Lex (and other publications) called Cellar. - -```{r makequery, message = FALSE, warning=FALSE, error=FALSE} -library(eurlex) -library(dplyr) - -query_dir <- elx_make_query(resource_type = "directive") -``` - - -```{r precompute, include=FALSE} -dirs <- elx_make_query(resource_type = "directive", include_date = TRUE, include_force = TRUE) %>% - elx_run_query() - -results <- dirs %>% select(-force,-date) -``` - -Currently, it is possible to choose from among a host of resource types, including directives, regulations and even case law (see function description for the full list). It is also possible to manually specify a resource type from the [eligible list](http://publications.europa.eu/resource/authority/resource-type).^[Note, however, that not all resource types will work properly with the pre-specified query.] 
- -The choice of resource type is then reflected in the SPARQL query generated by the function: - -```{r} -query_dir %>% - cat() - -elx_make_query(resource_type = "caselaw") %>% - cat() - -elx_make_query(resource_type = "manual", manual_type = "SWD") %>% - cat() - -``` - -There are various ways of querying the same information in the Cellar database due to the existence of several overlapping classes and identifiers describing the same resources. The queries generated by the function should offer a reliable way of obtaining exhaustive results, as they have been validated by the helpdesk of the Publication Office. At the same time, it is always possible there will be issues either on the query or the database side; please report any you encounter through Github. - -The other arguments in `elx_make_query()` relate to additional metadata to be returned. The results include by default the [CELEX number](https://eur-lex.europa.eu/content/tools/TableOfSectors/types_of_documents_in_eurlex.html) and exclude corrigenda (corrections of errors in legislation). Other data needs to be opted into. Make sure to select ones that are logically compatible (e.g. case law does not have a legal basis). More options should be added in the future. - -Note that availability of data for each variable might have an impact on the results. The data frame returned by the query might be shrunken to the size of the variable with most missing data. It is recommended to always compare results from a desired query to a minimal query requesting only celex ids. 
- -```{r} -elx_make_query(resource_type = "directive", include_date = TRUE, include_force = TRUE) %>% - cat() - -# minimal query: elx_make_query(resource_type = "directive") - -elx_make_query(resource_type = "recommendation", include_date = TRUE, include_lbs = TRUE) %>% - cat() - -# minimal query: elx_make_query(resource_type = "recommendation") - -``` - -You can also decide to not specify any resource types, in which case all types of documents will be returned. As there are over a million documents with a CELEX identifier, this is likely not efficient for a majority of users. But since version 0.3.5 it is possible to request documents belonging to a particular ["sector"](https://eur-lex.europa.eu/content/tools/TableOfSectors/types_of_documents_in_eurlex.html) or [directory code](https://eur-lex.europa.eu/browse/directories/legislation.html). - -```{r} -# request documents from directory 18 ("Common Foreign and Security Policy") -# and sector 3 ("Legal acts") - -elx_make_query(resource_type = "any", - directory = "18", - sector = 3) %>% - cat() -``` - -Now that we have a query, we are ready to run it. - -## `elx_run_query()`: Execute SPARQL queries - -`elx_run_query()` sends SPARQL queries to a pre-specified endpoint. The function takes the query string as the main argument, which means you can manually pass it any working SPARQL query (relevant to official EU publications). - -```{r runquery, eval=FALSE} -results <- elx_run_query(query = query_dir) - -# the functions are compatible with piping -# -# elx_make_query("directive") %>% -# elx_run_query() -``` - -```{r} -as_tibble(results) -``` - -The function outputs a `data.frame` where each column corresponds to one of the requested variables, while the rows accumulate observations of the resource type satisfying the query criteria. Obviously, the more data is to be returned, the longer the execution time, varying from a few seconds to several minutes, depending also on your connection. 
- -The first column always contains the unique URI of a "work" (legislative act or court judgment) which identifies each resource in Cellar. Several human-readable identifiers are normally associated with each "work" but the most useful one is CELEX, retrieved by default.^[Occasionally, you may encounter legal acts without CELEX numbers, especially when digging through older legislation. It is good to report these to the Eur-Lex helpdesk.] - -One column you should always pay attention to is `type` (as in `resource_type`). The URIs contained there reflect the FILTER argument in the SPARQL query, which is manually pre-specified. All resources are indexed as being of one type or another. For example, when retrieving directives, the results are going to return also delegated directives, which might not be desirable, depending on your needs. You can filter results by `type` to make the necessary adjustments. The queries are expansive by default in the spirit of erring on the side of over-inclusiveness rather than vice versa. - -```{r} -head(results$type,5) - -results %>% - distinct(type) -``` - -The data is returned in the long format, which means that rows are recycled up to the length of the variable with the most data points. For example, if 20 directives are returned, each with two legal bases, the resulting `data.frame` will have 40 rows. Some variables, such as dates, contain unexpectedly several entries for some documents. You should always check the number of unique identifiers in the results instead of assuming that each row is a unique observation. - -### EuroVoc descriptors - -EuroVoc is a multilingual thesaurus, keywords from which are used to describe the content of European Union documents. Most resource types that can be retrieved with the pre-defined queries in this package can be accompanied by EuroVoc keywords and these can be retrieved as other variables. 
- -```{r eurovoc} - -rec_eurovoc <- elx_make_query("recommendation", include_eurovoc = TRUE, limit = 10) %>% - elx_run_query() # truncated results for sake of the example - -rec_eurovoc %>% - select(celex, eurovoc) - -``` - -By default, the endpoint returns the EuroVoc concept codes rather than the labels (keywords). The function `elx_label_eurovoc()` needs to be called to obtain a look-up table with the labels. - -```{r eurovoctable} -eurovoc_lookup <- elx_label_eurovoc(uri_eurovoc = rec_eurovoc$eurovoc) - -print(eurovoc_lookup) -``` - -The results include labels only for unique identifiers, but with `dplyr::left_join()` it is straightforward to append the labels to the entire dataset. - -```{r appendlabs} -rec_eurovoc %>% - left_join(eurovoc_lookup) -``` - -As elsewhere in the API, we can tap into the multilingual nature of EU documents also when it comes to the EuroVoc keywords. Moreover, most concepts in the thesaurus are associated with alternative labels; these can be returned as well (separated by a comma). - -```{r} -eurovoc_lookup <- elx_label_eurovoc(uri_eurovoc = rec_eurovoc$eurovoc, - alt_labels = TRUE, - language = "sk") - -rec_eurovoc %>% - left_join(eurovoc_lookup) %>% - select(celex, eurovoc, labels) -``` - -## `elx_fetch_data()`: Fire GET requests - -A core contribution of the SPARQL requests is that we obtain a comprehensive list of identifiers that we can subsequently use to obtain more data relating to the document in question. While the results of the SPARQL queries are useful also for webscraping (with the `rvest` package), the function `elx_fetch_data()` enables us to fire GET requests to retrieve data on documents with known identifiers (including Cellar URI). - -One of the most sought-after data in the Eur-Lex dataverse is the text. It is possible now to automate the pipeline for downloading html and plain texts from Eur-Lex. Similarly, you can retrieve the title of the document. 
For both you can specify also the desired language (English by default). Other metadata might be added in the future. - -```{r getdatapur, message = FALSE, warning=FALSE, error=FALSE} -# the function is not vectorized by default -# elx_fetch_data(url = results$work[1], type = "title") - -# we can use purrr::map() to play that role -library(purrr) - -# wrapping in possibly() would catch errors in case there is a server issue -dir_titles <- results[1:5,] %>% # take the first 5 directives only to save time - mutate(work = paste("http://publications.europa.eu/resource/cellar/", work, sep = "")) |> - mutate(title = map_chr(work, possibly(elx_fetch_data, otherwise = NA_character_), - "title")) %>% - as_tibble() %>% - select(celex, title) - -print(dir_titles) - -``` - -Note that text requests are by far the most time-intensive; requesting the full text for thousands of documents is liable to extend the run-time into hours. Texts are retrieved from html by priority, but methods for .pdfs and .docs are also implemented.^[It is worth pointing out that the html and pdf contents of older case law differs. Whereas typically the html file is only going to contain a summary and grounds of a judgment, the pdf should also contain background to the dispute.] The function even handles multi-document resources (by pasting them together). - -# Application - -In this section I showcase a simple application of `eurlex` on making overviews of EU legislation. First, we collate data on directives. - -```{r dirsdata, eval=FALSE} -dirs <- elx_make_query(resource_type = "directive", include_date = TRUE, include_force = TRUE) %>% - elx_run_query() -``` - -Let's calculate the proportion of directives currently in force in the entire set of directives ever adopted. This variable offers a particularly good demonstration of the usefulness of the package to retrieve EU law data, because it changes every day, as new acts enter into force and old ones drop out. 
Regularly scraping webpages for this purpose and scale is simply impractical and disproportional. - -```{r firstplot, message = FALSE, warning=FALSE, error=FALSE} -library(ggplot2) - -dirs %>% - count(force) %>% - ggplot(aes(x = force, y = n)) + - geom_col() -``` - -Directives become naturally outdated with time. It might be all the more interesting to see which older acts are thus still surviving. - -```{r dirforce} -dirs %>% - filter(!is.na(force)) %>% - mutate(date = as.Date(date)) %>% - ggplot(aes(x = date, y = celex)) + - geom_point(aes(color = force), alpha = 0.1) + - theme(axis.text.y = element_blank(), - axis.line.y = element_blank(), - axis.ticks.y = element_blank()) -``` - -We want to know a bit more about some directives from the early 1970s that are still in force today. Their titles could give us a clue. - -```{r dirtitles} -dirs_1970_title <- dirs %>% - filter(between(as.Date(date), as.Date("1970-01-01"), as.Date("1973-01-01")), - force == "true") %>% - mutate(work = paste("http://publications.europa.eu/resource/cellar/", work, sep = "")) |> - mutate(title = map_chr(work, possibly(elx_fetch_data, otherwise = NA_character_), - "title")) %>% - as_tibble() - -print(dirs_1970_title) -``` - -I will use the `tidytext` package to get a quick idea of what the legislation is about. - -```{r wordcloud, message = FALSE, warning=FALSE, error=FALSE} -library(tidytext) -library(wordcloud) - -# wordcloud -dirs_1970_title %>% - select(celex,title) %>% - unnest_tokens(word, title) %>% - count(celex, word, sort = TRUE) %>% - filter(!grepl("\\d", word)) %>% - bind_tf_idf(word, celex, n) %>% - with(wordcloud(word, tf_idf, max.words = 40)) -``` - -I use term-frequency inverse-document frequency (tf-idf) to weight the importance of the words in the wordcloud. If we used pure frequencies, the wordcloud would largely consist of words conveying little meaning ("the", "and", ...). - -This is an extremely basic application of the `eurlex` package. 
Much more sophisticated methods can be used to analyse both the content and metadata of European Union legislation. If the package is useful for your research, please cite the [accompanying paper](https://www.tandfonline.com/doi/full/10.1080/2474736X.2020.1870150).^[Michal Ovádek (2021) Facilitating access to data on European Union laws, Political Research Exchange, 3:1, DOI: [10.1080/2474736X.2020.1870150](https://www.tandfonline.com/doi/full/10.1080/2474736X.2020.1870150)] +--- +title: "eurlex: Retrieve data on European Union law in R" +output: rmarkdown::html_vignette +description: > + Retrieve data on European Union law in R with + pre-defined SPARQL and REST queries. +vignette: > + %\VignetteIndexEntry{eurlex: Retrieve data on European Union law in R} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + +```{r, echo = FALSE, message = FALSE, warning=FALSE, error=FALSE} +knitr::opts_chunk$set(collapse = T, comment = "#>") +options(tibble.print_min = 4, tibble.print_max = 4) +``` + +This vignette shows how to use the `eurlex` R package to retrieve data on European Union law. + +# Introduction + +Dozens of political scientists and legal scholars use data on European Union laws in their research. The provenance of these data is rarely discussed. More often than not, researchers resort to the quick and dirty technique of scraping entire html pages from `eur-lex.europa.eu`. This is not the optimal, nor preferred (from the perspective of the server host) approach of retrieving data, however, especially as the Publication Office of the European Union, the public body behind Eur-Lex, operates several dedicated APIs for automated retrieval of its data. + +The allure of web scraping is completely understandable. 
Not only is it easier to download data that can be readily seen in a user-friendly manner through a browser, using the dedicated APIs requires technical knowledge of semantic web and Client URL technologies, which is not necessarily widespread among researchers. And why go through the pain of learning how to compile SPARQL queries when it is much easier to simply download the web page? + +The `eurlex` R package attempts to significantly reduce the overhead associated with using the SPARQL and REST APIs made available by the EU Publication Office. Although at present it does not offer access to the same array of information as comprehensive web scraping might, the package provides simpler, more efficient and transparent access to data on European Union law. This vignette gives a quick guide to the package and an even quicker introduction to the Eur-Lex dataverse. + +# The `eurlex` package + +The `eurlex` package currently envisions the typical use-case to consist of getting bulk information about EU law and policy into R as fast as possible. The package contains three core functions to achieve that objective: `elx_make_query()` to create SPARQL queries based on user input; `elx_run_query()` to execute the pre-made or any other manually input query; and `elx_fetch_data()` to fire GET requests for certain metadata to the REST API. + +The package also contains largely self-explanatory functions for retrieving data on EU court cases (`elx_curia_list()`) and Council votes (`elx_council_votes()`) from outside Eur-Lex. More advanced users might be interested in downloading and custom-parsing XML notices with `elx_download_xml()`. + +## `elx_make_query()`: Generate SPARQL queries + +The function `elx_make_query` takes as its first argument the type of resource to be retrieved from the semantic database that powers Eur-Lex (and other publications) called Cellar. 
+ +```{r makequery, message = FALSE, warning=FALSE, error=FALSE} +library(eurlex) +library(dplyr) + +query_dir <- elx_make_query(resource_type = "directive") +``` + + +```{r precompute, include=FALSE} +dirs <- elx_make_query(resource_type = "directive", include_date = TRUE, include_force = TRUE) %>% + elx_run_query() + +results <- dirs %>% select(-force,-date) +``` + +Currently, it is possible to choose from among a host of resource types, including directives, regulations and even case law (see function description for the full list). It is also possible to manually specify a resource type from the [eligible list](http://publications.europa.eu/resource/authority/resource-type).^[Note, however, that not all resource types will work properly with the pre-specified query.] + +The choice of resource type is then reflected in the SPARQL query generated by the function: + +```{r} +query_dir %>% + cat() + +elx_make_query(resource_type = "caselaw") %>% + cat() + +elx_make_query(resource_type = "manual", manual_type = "SWD") %>% + cat() + +``` + +There are various ways of querying the same information in the Cellar database due to the existence of several overlapping classes and identifiers describing the same resources. The queries generated by the function should offer a reliable way of obtaining exhaustive results, as they have been validated by the helpdesk of the Publication Office. At the same time, it is always possible there will be issues either on the query or the database side; please report any you encounter through Github. + +The other arguments in `elx_make_query()` relate to additional metadata to be returned. The results include by default the [CELEX number](https://eur-lex.europa.eu/content/tools/TableOfSectors/types_of_documents_in_eurlex.html) and exclude corrigenda (corrections of errors in legislation). Other data needs to be opted into. Make sure to select ones that are logically compatible (e.g. case law does not have a legal basis). 
More options should be added in the future. + +Note that availability of data for each variable might have an impact on the results. The data frame returned by the query might be shrunken to the size of the variable with most missing data. It is recommended to always compare results from a desired query to a minimal query requesting only celex ids. + +```{r} +elx_make_query(resource_type = "directive", include_date = TRUE, include_force = TRUE) %>% + cat() + +# minimal query: elx_make_query(resource_type = "directive") + +elx_make_query(resource_type = "recommendation", include_date = TRUE, include_lbs = TRUE) %>% + cat() + +# minimal query: elx_make_query(resource_type = "recommendation") + +``` + +You can also decide to not specify any resource types, in which case all types of documents will be returned. As there are over a million documents with a CELEX identifier, this is likely not efficient for a majority of users. But since version 0.3.5 it is possible to request documents belonging to a particular ["sector"](https://eur-lex.europa.eu/content/tools/TableOfSectors/types_of_documents_in_eurlex.html) or [directory code](https://eur-lex.europa.eu/browse/directories/legislation.html). + +```{r} +# request documents from directory 18 ("Common Foreign and Security Policy") +# and sector 3 ("Legal acts") + +elx_make_query(resource_type = "any", + directory = "18", + sector = 3) %>% + cat() +``` + +Now that we have a query, we are ready to run it. + +## `elx_run_query()`: Execute SPARQL queries + +`elx_run_query()` sends SPARQL queries to a pre-specified endpoint. The function takes the query string as the main argument, which means you can manually pass it any working SPARQL query (relevant to official EU publications). 
+ +```{r runquery, eval=FALSE} +results <- elx_run_query(query = query_dir) + +# the functions are compatible with piping +# +# elx_make_query("directive") %>% +# elx_run_query() +``` + +```{r} +as_tibble(results) +``` + +The function outputs a `data.frame` where each column corresponds to one of the requested variables, while the rows accumulate observations of the resource type satisfying the query criteria. Obviously, the more data is to be returned, the longer the execution time, varying from a few seconds to several minutes, depending also on your connection. + +The first column always contains the unique URI of a "work" (legislative act or court judgment) which identifies each resource in Cellar. Several human-readable identifiers are normally associated with each "work" but the most useful one is CELEX, retrieved by default.^[Occasionally, you may encounter legal acts without CELEX numbers, especially when digging through older legislation. It is good to report these to the Eur-Lex helpdesk.] + +One column you should always pay attention to is `type` (as in `resource_type`). The URIs contained there reflect the FILTER argument in the SPARQL query, which is manually pre-specified. All resources are indexed as being of one type or another. For example, when retrieving directives, the results are going to return also delegated directives, which might not be desirable, depending on your needs. You can filter results by `type` to make the necessary adjustments. The queries are expansive by default in the spirit of erring on the side of over-inclusiveness rather than vice versa. + +```{r} +head(results$type,5) + +results %>% + distinct(type) +``` + +The data is returned in the long format, which means that rows are recycled up to the length of the variable with the most data points. For example, if 20 directives are returned, each with two legal bases, the resulting `data.frame` will have 40 rows. 
Some variables, such as dates, contain unexpectedly several entries for some documents. You should always check the number of unique identifiers in the results instead of assuming that each row is a unique observation. + +### EuroVoc descriptors + +EuroVoc is a multilingual thesaurus, keywords from which are used to describe the content of European Union documents. Most resource types that can be retrieved with the pre-defined queries in this package can be accompanied by EuroVoc keywords and these can be retrieved as other variables. + +```{r eurovoc} + +rec_eurovoc <- elx_make_query("recommendation", include_eurovoc = TRUE, limit = 10) %>% + elx_run_query() # truncated results for sake of the example + +rec_eurovoc %>% + select(celex, eurovoc) + +``` + +By default, the endpoint returns the EuroVoc concept codes rather than the labels (keywords). The function `elx_label_eurovoc()` needs to be called to obtain a look-up table with the labels. + +```{r eurovoctable} +eurovoc_lookup <- elx_label_eurovoc(uri_eurovoc = rec_eurovoc$eurovoc) + +print(eurovoc_lookup) +``` + +The results include labels only for unique identifiers, but with `dplyr::left_join()` it is straightforward to append the labels to the entire dataset. + +```{r appendlabs} +rec_eurovoc %>% + left_join(eurovoc_lookup) +``` + +As elsewhere in the API, we can tap into the multilingual nature of EU documents also when it comes to the EuroVoc keywords. Moreover, most concepts in the thesaurus are associated with alternative labels; these can be returned as well (separated by a comma). 
+ +```{r} +eurovoc_lookup <- elx_label_eurovoc(uri_eurovoc = rec_eurovoc$eurovoc, + alt_labels = TRUE, + language = "sk") + +rec_eurovoc %>% + left_join(eurovoc_lookup) %>% + select(celex, eurovoc, labels) +``` + +## `elx_fetch_data()`: Fire GET requests + +A core contribution of the SPARQL requests is that we obtain a comprehensive list of identifiers that we can subsequently use to obtain more data relating to the document in question. While the results of the SPARQL queries are useful also for webscraping (with the `rvest` package), the function `elx_fetch_data()` enables us to fire GET requests to retrieve data on documents with known identifiers (including Cellar URI). + +One of the most sought-after data in the Eur-Lex dataverse is the text. It is possible now to automate the pipeline for downloading html and plain texts from Eur-Lex. Similarly, you can retrieve the title of the document. For both you can specify also the desired language (English by default). Other metadata might be added in the future. + +```{r getdatapur, message = FALSE, warning=FALSE, error=FALSE} +# the function is not vectorized by default +# elx_fetch_data(url = results$work[1], type = "title") + +# we can use purrr::map() to play that role +library(purrr) + +# wrapping in possibly() would catch errors in case there is a server issue +dir_titles <- results[1:5,] %>% # take the first 5 directives only to save time + mutate(work = paste("http://publications.europa.eu/resource/cellar/", work, sep = "")) |> + mutate(title = map_chr(work, possibly(elx_fetch_data, otherwise = NA_character_), + "title")) %>% + as_tibble() %>% + select(celex, title) + +print(dir_titles) + +``` + +Note that text requests are by far the most time-intensive; requesting the full text for thousands of documents is liable to extend the run-time into hours. 
Texts are retrieved from html by priority, but methods for .pdfs and .docs are also implemented.^[It is worth pointing out that the html and pdf contents of older case law differs. Whereas typically the html file is only going to contain a summary and grounds of a judgment, the pdf should also contain background to the dispute.] The function even handles multi-document resources (by pasting them together). + +# Application + +In this section I showcase a simple application of `eurlex` on making overviews of EU legislation. First, we collate data on directives. + +```{r dirsdata, eval=FALSE} +dirs <- elx_make_query(resource_type = "directive", include_date = TRUE, include_force = TRUE) %>% + elx_run_query() +``` + +Let's calculate the proportion of directives currently in force in the entire set of directives ever adopted. This variable offers a particularly good demonstration of the usefulness of the package to retrieve EU law data, because it changes every day, as new acts enter into force and old ones drop out. Regularly scraping webpages for this purpose and scale is simply impractical and disproportional. + +```{r firstplot, message = FALSE, warning=FALSE, error=FALSE} +library(ggplot2) + +dirs %>% + count(force) %>% + ggplot(aes(x = force, y = n)) + + geom_col() +``` + +Directives become naturally outdated with time. It might be all the more interesting to see which older acts are thus still surviving. + +```{r dirforce} +dirs %>% + filter(!is.na(force)) %>% + mutate(date = as.Date(date)) %>% + ggplot(aes(x = date, y = celex)) + + geom_point(aes(color = force), alpha = 0.1) + + theme(axis.text.y = element_blank(), + axis.line.y = element_blank(), + axis.ticks.y = element_blank()) +``` + +We want to know a bit more about some directives from the early 1970s that are still in force today. Their titles could give us a clue. 
+ +```{r dirtitles} +dirs_1970_title <- dirs %>% + filter(between(as.Date(date), as.Date("1970-01-01"), as.Date("1973-01-01")), + force == "true") %>% + mutate(work = paste("http://publications.europa.eu/resource/cellar/", work, sep = "")) |> + mutate(title = map_chr(work, possibly(elx_fetch_data, otherwise = NA_character_), + "title")) %>% + as_tibble() + +print(dirs_1970_title) +``` + +I will use the `tidytext` package to get a quick idea of what the legislation is about. + +```{r wordcloud, message = FALSE, warning=FALSE, error=FALSE} +library(tidytext) +library(wordcloud) + +# wordcloud +dirs_1970_title %>% + select(celex,title) %>% + unnest_tokens(word, title) %>% + count(celex, word, sort = TRUE) %>% + filter(!grepl("\\d", word)) %>% + bind_tf_idf(word, celex, n) %>% + with(wordcloud(word, tf_idf, max.words = 40)) +``` + +I use term-frequency inverse-document frequency (tf-idf) to weight the importance of the words in the wordcloud. If we used pure frequencies, the wordcloud would largely consist of words conveying little meaning ("the", "and", ...). + +This is an extremely basic application of the `eurlex` package. Much more sophisticated methods can be used to analyse both the content and metadata of European Union legislation. 
If the package is useful for your research, please cite the [accompanying paper](https://www.tandfonline.com/doi/full/10.1080/2474736X.2020.1870150).^[Michal Ovádek (2021) Facilitating access to data on European Union laws, Political Research Exchange, 3:1, DOI: [10.1080/2474736X.2020.1870150](https://www.tandfonline.com/doi/full/10.1080/2474736X.2020.1870150)] diff --git a/vignettes/sparql-queries.Rmd b/vignettes/sparql-queries.Rmd new file mode 100644 index 0000000..4fd6103 --- /dev/null +++ b/vignettes/sparql-queries.Rmd @@ -0,0 +1,101 @@ +--- +title: "Make SPARQL queries with eurlex" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Make SPARQL queries with eurlex} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +```{r setup} +library(eurlex) +``` + +This vignette shows how to use the `eurlex` R package to make SPARQL queries to retrieve data on European Union law. + +# Introduction + +Dozens of political scientists and legal scholars use data on European Union laws in their research. The provenance of these data is rarely discussed. More often than not, researchers resort to the quick and dirty technique of scraping entire html pages from `eur-lex.europa.eu`. This is not the optimal, nor preferred (from the perspective of the server host) approach of retrieving data, however, especially as the Publication Office of the European Union, the public body behind Eur-Lex, operates several dedicated APIs for automated retrieval of its data. + +The allure of web scraping is completely understandable. Not only is it easier to download data that can be readily seen in a user-friendly manner through a browser, using the dedicated APIs requires technical knowledge of semantic web and Client URL technologies, which is not necessarily widespread among researchers. 
And why go through the pain of learning how to compile SPARQL queries when it is much easier to simply download the web page? + +The `eurlex` R package attempts to significantly reduce the overhead associated with using the SPARQL and REST APIs made available by the EU Publication Office. Although at present it does not offer access to the same array of information as comprehensive web scraping might, the package provides simpler, more efficient and transparent access to data on European Union law. This vignette gives a quick guide to the package and an even quicker introduction to the Eur-Lex dataverse. + +# The `eurlex` package + +The `eurlex` package currently envisions the typical use case as getting bulk information about EU law and policy into R as fast as possible. The package contains three core functions to achieve that objective: `elx_make_query()` to create SPARQL queries based on user input; `elx_run_query()` to execute the pre-made or any other manually input query; and `elx_fetch_data()` to fire GET requests for certain metadata to the REST API. + +The package also contains largely self-explanatory functions for retrieving data on EU court cases (`elx_curia_list()`) and Council votes (`elx_council_votes()`) from outside Eur-Lex. More advanced users might be interested in downloading and custom-parsing XML notices with `elx_download_xml()`. + +## `elx_make_query()`: Generate SPARQL queries + +The function `elx_make_query()` takes as its first argument the type of resource to be retrieved from Cellar, the semantic database that powers Eur-Lex (and other publications). 
+ +```{r makequery, message = FALSE, warning=FALSE, error=FALSE} +library(eurlex) +library(dplyr) + +query_dir <- elx_make_query(resource_type = "directive") +``` + + +```{r precompute, include=FALSE} +dirs <- elx_make_query(resource_type = "directive", include_date = TRUE, include_force = TRUE) %>% + elx_run_query() + +results <- dirs %>% select(-force,-date) +``` + +Currently, it is possible to choose from among a host of resource types, including directives, regulations and even case law (see function description for the full list). It is also possible to manually specify a resource type from the [eligible list](http://publications.europa.eu/resource/authority/resource-type).^[Note, however, that not all resource types will work properly with the pre-specified query.] + +The choice of resource type is then reflected in the SPARQL query generated by the function: + +```{r} +query_dir %>% + cat() + +elx_make_query(resource_type = "caselaw") %>% + cat() + +elx_make_query(resource_type = "manual", manual_type = "SWD") %>% + cat() + +``` + +There are various ways of querying the same information in the Cellar database due to the existence of several overlapping classes and identifiers describing the same resources. The queries generated by the function should offer a reliable way of obtaining exhaustive results, as they have been validated by the helpdesk of the Publication Office. At the same time, it is always possible there will be issues either on the query or the database side; please report any you encounter through Github. + +The other arguments in `elx_make_query()` relate to additional metadata to be returned. The results include by default the [CELEX number](https://eur-lex.europa.eu/content/tools/TableOfSectors/types_of_documents_in_eurlex.html) and exclude corrigenda (corrections of errors in legislation). Other data needs to be opted into. Make sure to select ones that are logically compatible (e.g. case law does not have a legal basis). 
More options should be added in the future. + +Note that availability of data for each variable might have an impact on the results. The data frame returned by the query might be shrunk to the size of the variable with the most missing data. It is recommended to always compare results from a desired query to a minimal query requesting only celex ids. + +```{r} +elx_make_query(resource_type = "directive", include_date = TRUE, include_force = TRUE) %>% + cat() + +# minimal query: elx_make_query(resource_type = "directive") + +elx_make_query(resource_type = "recommendation", include_date = TRUE, include_lbs = TRUE) %>% + cat() + +# minimal query: elx_make_query(resource_type = "recommendation") + +``` + +You can also decide not to specify any resource types, in which case all types of documents will be returned. As there are over a million documents with a CELEX identifier, this is likely not efficient for a majority of users. But since version 0.3.5 it is possible to request documents belonging to a particular ["sector"](https://eur-lex.europa.eu/content/tools/TableOfSectors/types_of_documents_in_eurlex.html) or [directory code](https://eur-lex.europa.eu/browse/directories/legislation.html). + +```{r} +# request documents from directory 18 ("Common Foreign and Security Policy") +# and sector 3 ("Legal acts") + +elx_make_query(resource_type = "any", + directory = "18", + sector = 3) %>% + cat() +```