From 6bbf4d6e315fc2cea6eefc295f31e6972b176298 Mon Sep 17 00:00:00 2001 From: Matt Cowgill Date: Mon, 14 Jan 2019 14:56:21 +1100 Subject: [PATCH] added 'tables' argument to get_abs(), with ability to specify particular table(s) to download --- DESCRIPTION | 2 +- LICENSE | 2 +- R/get_abs_xml_metadata.R | 13 ++++++++++--- R/get_xml_df.R | 21 ++++++++++++++++++--- R/read_abs.R | 14 ++++++++++++-- 5 files changed, 42 insertions(+), 10 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e3b4343..69cdaed 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -9,7 +9,7 @@ Authors@R: c( Maintainer: Matt Cowgill Description: Downloads, imports, and tidies time series data from the Australian Bureau of Statistics . -Date: 2018-05-30 +Date: 2019-01-14 License: MIT + file LICENSE Encoding: UTF-8 LazyData: true diff --git a/LICENSE b/LICENSE index ad76497..aa599d9 100644 --- a/LICENSE +++ b/LICENSE @@ -1,2 +1,2 @@ YEAR:2018 -COPYRIGHT HOLDER: Zoe Meers +COPYRIGHT HOLDER: Zoe Meers and Matt Cowgill diff --git a/R/get_abs_xml_metadata.R b/R/get_abs_xml_metadata.R index 7df6ae2..9796f17 100644 --- a/R/get_abs_xml_metadata.R +++ b/R/get_abs_xml_metadata.R @@ -5,14 +5,21 @@ # given a catalogue number, download the catalogue metadata via XML, then find # unique filenames in the latest release and return those -get_abs_xml_metadata <- function(cat_no) { +get_abs_xml_metadata <- function(cat_no, table) { ProductReleaseDate=TableOrder=text=NULL + if(table == "all"){ + tables_url <- "" + } else { + tables_url <- paste0("&ttitle=", table) + } + # Download the first page of metadata for cat_no first_url <- paste0("http://ausstats.abs.gov.au/servlet/TSSearchServlet?catno=", cat_no, - "&pg=1") + "&pg=1", + tables_url) first_page <- XML::xmlParse(file = first_url) @@ -47,7 +54,7 @@ get_abs_xml_metadata <- function(cat_no) { xml_dfs <- list() while(current == TRUE){ - xml_df <- get_xml_df(cat_no = cat_no, metadata_page = all_pages[i]) + xml_df <- get_xml_df(cat_no = cat_no, table = table, 
metadata_page = all_pages[i]) xml_dfs[[i]] <- xml_df diff --git a/R/get_xml_df.R b/R/get_xml_df.R index 0bc4aef..d940c8c 100644 --- a/R/get_xml_df.R +++ b/R/get_xml_df.R @@ -2,15 +2,30 @@ #' @importFrom XML xmlParse xmlToDataFrame #' @importFrom dplyr filter select "%>%" -get_xml_df <- function(cat_no, metadata_page){ +get_xml_df <- function(cat_no, table, metadata_page){ text=NULL + if(table == "all"){ + tables_url <- "" + } else { + tables_url <- paste0("&ttitle=", table) + } + base_url <- "http://ausstats.abs.gov.au/servlet/TSSearchServlet?catno=" - url <- paste0(base_url, cat_no, "&pg=", metadata_page) + url <- paste0(base_url, cat_no, "&pg=", metadata_page, tables_url) + + safe_parse <- purrr::safely(XML::xmlParse) - xml_page <- XML::xmlParse(file = url) + xml_page <- safe_parse(file = url) + + if(is.null(xml_page$error)){ + xml_page <- xml_page$result + } else { + stop(paste0("Error: the following URL does not contain valid ABS metadata:\n", + url)) + } xml_df <- XML::xmlToDataFrame(xml_page, stringsAsFactors = FALSE) diff --git a/R/read_abs.R b/R/read_abs.R index 93d9e77..163332d 100644 --- a/R/read_abs.R +++ b/R/read_abs.R @@ -8,6 +8,10 @@ #' @param cat_no ABS catalogue number, as a string, including the extension. #' For example, "6202.0". #' +#' @param tables Time series tables in `cat_no` to download and extract. Default is "all", +#' which will read all time series in `cat_no`. Specify `tables` to +#' download and import specific table(s) - e.g. `tables = 1` or `tables = c(1, 5)`. +#' #' @param path Local directory in which to save downloaded ABS time series #' spreadsheets. Default is "data/ABS"; this subdirectory of your working #' directory will be created if it does not exist. 
@@ -24,11 +28,12 @@ #' #' \donttest{wpi <- read_abs("6345.0")} #' -#' @importFrom purrr walk walk2 map +#' @importFrom purrr walk walk2 map map2_dfr #' @name read_abs #' @export read_abs <- function(cat_no = NULL, + tables = "all", path = "data/ABS", show_progress_bars = TRUE){ @@ -44,9 +49,14 @@ read_abs <- function(cat_no = NULL, stop("Please ensure you include the cat_no extension, eg. '6202.0', not '6202'") } + if(is.null(tables)){ + message(paste0("`tables` not specified; attempting to fetch all tables from ", cat_no)) + } + # find URLs from cat_no message(paste0("Finding filenames for tables from ABS catalogue ", cat_no)) - xml_dfs <- get_abs_xml_metadata(cat_no = cat_no) + xml_dfs <- purrr::map2_dfr(cat_no, tables, + .f = get_abs_xml_metadata) urls <- unique(xml_dfs$TableURL) urls <- gsub(".test", "", urls)