diff --git a/NEWS.md b/NEWS.md index 7c09feb..0dbced3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +# eurlex 0.3.3 + +## Minor changes + +- hotfix for critical bug in xml parsing that scrambled column with legal basis where this was requested + # eurlex 0.3.2 ## Major changes diff --git a/R/elx_make_query.R b/R/elx_make_query.R index 7b1a3c0..bc5a0e3 100644 --- a/R/elx_make_query.R +++ b/R/elx_make_query.R @@ -155,8 +155,7 @@ elx_make_query <- function(resource_type = c("directive","regulation","decision" ?type=|| ?type=|| ?type=|| - ?type=|| - ?type=)", sep = " ") + ?type=)", sep = " ") } if (resource_type == "caselaw"){ diff --git a/R/elx_parse_xml.R b/R/elx_parse_xml.R index df8e22b..cf03b46 100644 --- a/R/elx_parse_xml.R +++ b/R/elx_parse_xml.R @@ -19,9 +19,16 @@ elx_parse_xml <- function(sparql_response = ""){ res_cols <- res_binding %>% xml2::xml_attr("name") - out <- data.frame(res_cols, res_text, stringsAsFactors = FALSE) %>% - dplyr::group_by(res_cols) %>% - dplyr::mutate(triplet = dplyr::row_number()) %>% + unique(res_cols) + + out <- dplyr::tibble(res_cols, res_text) %>% + dplyr::mutate(is_work = ifelse(res_cols=="work", T, NA)) %>% + dplyr::group_by(is_work) %>% + dplyr::mutate(triplet = dplyr::row_number(), + triplet = ifelse(is_work==T, triplet, NA)) %>% + dplyr::ungroup() %>% + tidyr::fill(triplet) %>% + dplyr::select(-.data$is_work) %>% tidyr::pivot_wider(names_from = res_cols, values_from = res_text) %>% dplyr::select(-.data$triplet) diff --git a/README.md b/README.md index c602e14..0fbcff3 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ For the moment, it is recommended to retrieve metadata one variable at a time. F 2. `dates <- elx_make_query("directive", include_date_transpos = TRUE) %>% elx_run_query()` 3. `ids %>% dplyr::left_join(lbs) %>% dplyr::left_join(dates)` -rather than `elx_make_query("directive", include_lbs = TRUE, include_date_transpos = TRUE)`. 
The reason is that observations with missing data on any variable are currently dropped entirely when cumulating variable requests. By separating the calls, you are able to at least identify the missing data. +rather than `elx_make_query("directive", include_lbs = TRUE, include_date_transpos = TRUE)`. The reason is that rows with missing data on any variable are currently dropped entirely when several variables are requested in a single query. By separating the calls, you can identify the missing data while retaining the data from the other columns. One of the main contributions of the SPARQL requests is that we obtain a comprehensive list of identifiers that we can subsequently use to obtain more data relating to the document in question. While the results of the SPARQL queries are useful also for webscraping (with the `rvest` package), the function `elx_fetch_data()` enables us to fire GET requests to retrieve data on documents with known identifiers (including Cellar URI). The function currently enables downloading the title and the full text of a document in all available languages. diff --git a/eurlex.Rproj b/eurlex.Rproj new file mode 100644 index 0000000..b9255bc --- /dev/null +++ b/eurlex.Rproj @@ -0,0 +1,20 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source