
Commit

added 'tables' argument to get_abs(), with ability to specify particular table(s) to download
MattCowgill committed Jan 14, 2019
1 parent 2dca695 commit 6bbf4d6
Showing 5 changed files with 42 additions and 10 deletions.
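Based on the documentation added to R/read_abs.R below, the new `tables` argument is intended to be used along these lines. A sketch only; these calls download spreadsheets from the ABS, so they are not run here:

library(readabs)

# read all time series tables in a catalogue (the previous behaviour)
lfs_all <- read_abs("6202.0")

# read only table 1, or tables 1 and 5
lfs_1  <- read_abs("6202.0", tables = 1)
lfs_15 <- read_abs("6202.0", tables = c(1, 5))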
DESCRIPTION (2 changes: 1 addition & 1 deletion)
@@ -9,7 +9,7 @@ Authors@R: c(
Maintainer: Matt Cowgill <[email protected]>
Description: Downloads, imports, and tidies time series data from the
Australian Bureau of Statistics <https://www.abs.gov.au/>.
- Date: 2018-05-30
+ Date: 2019-01-14
License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
LICENSE (2 changes: 1 addition & 1 deletion)
@@ -1,2 +1,2 @@
YEAR:2018
- COPYRIGHT HOLDER: Zoe Meers
+ COPYRIGHT HOLDER: Zoe Meers and Matt Cowgill
R/get_abs_xml_metadata.R (13 changes: 10 additions & 3 deletions)
@@ -5,14 +5,21 @@
# given a catalogue number, download the catalogue metadata via XML, then find
# unique filenames in the latest release and return those

- get_abs_xml_metadata <- function(cat_no) {
+ get_abs_xml_metadata <- function(cat_no, table) {

+ ProductReleaseDate=TableOrder=text=NULL
+
+ if(table == "all"){
+   tables_url <- ""
+ } else {
+   tables_url <- paste0("&ttitle=", table)
+ }

# Download the first page of metadata for cat_no
first_url <- paste0("http://ausstats.abs.gov.au/servlet/TSSearchServlet?catno=",
cat_no,
"&pg=1")
"&pg=1",
tables_url)

first_page <- XML::xmlParse(file = first_url)

@@ -47,7 +54,7 @@ get_abs_xml_metadata <- function(cat_no) {
xml_dfs <- list()
while(current == TRUE){

- xml_df <- get_xml_df(cat_no = cat_no, metadata_page = all_pages[i])
+ xml_df <- get_xml_df(cat_no = cat_no, table = table, metadata_page = all_pages[i])

xml_dfs[[i]] <- xml_df

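A minimal sketch of the query-string logic added above: requesting a specific table appends a `ttitle` filter to the ABS time series search servlet URL, while "all" leaves the URL unchanged. (`build_tables_url` is a hypothetical helper name used here for illustration only; the package inlines the if/else.)

# hypothetical helper mirroring the if/else added to get_abs_xml_metadata()
build_tables_url <- function(table) {
  if (table == "all") "" else paste0("&ttitle=", table)
}

paste0("http://ausstats.abs.gov.au/servlet/TSSearchServlet?catno=",
       "6202.0", "&pg=1", build_tables_url(1))
#> [1] "http://ausstats.abs.gov.au/servlet/TSSearchServlet?catno=6202.0&pg=1&ttitle=1"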
R/get_xml_df.R (21 changes: 18 additions & 3 deletions)
@@ -2,15 +2,30 @@
#' @importFrom XML xmlParse xmlToDataFrame
#' @importFrom dplyr filter select "%>%"

- get_xml_df <- function(cat_no, metadata_page){
+ get_xml_df <- function(cat_no, table, metadata_page){

text=NULL

if(table == "all"){
tables_url <- ""
} else {
tables_url <- paste0("&ttitle=", table)
}

base_url <- "http://ausstats.abs.gov.au/servlet/TSSearchServlet?catno="

url <- paste0(base_url, cat_no, "&pg=", metadata_page)
url <- paste0(base_url, cat_no, "&pg=", metadata_page, tables_url)

+ safe_parse <- purrr::safely(XML::xmlParse)

- xml_page <- XML::xmlParse(file = url)
+ xml_page <- safe_parse(file = url)

+ if(is.null(xml_page$error)){
+   xml_page <- xml_page$result
+ } else {
+   stop(paste0("Error: the following URL does not contain valid ABS metadata:\n",
+        url))
+ }

xml_df <- XML::xmlToDataFrame(xml_page, stringsAsFactors = FALSE)

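The error handling above relies on purrr::safely(), which wraps a function so it returns a list with $result and $error components instead of throwing. A minimal illustration of the pattern, assuming purrr is installed:

library(purrr)

safe_log <- safely(log)

ok  <- safe_log(100)     # ok$result is 4.60517; ok$error is NULL
bad <- safe_log("oops")  # bad$result is NULL; bad$error is a simpleError

# the commit applies the same check: use $result when $error is NULL,
# otherwise stop() with an informative message
if (is.null(ok$error)) ok$result else stop("parsing failed")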
R/read_abs.R (14 changes: 12 additions & 2 deletions)
@@ -8,6 +8,10 @@
#' @param cat_no ABS catalogue number, as a string, including the extension.
#' For example, "6202.0".
#'
+ #' @param tables Time series tables in `cat_no` to download and extract. Default is "all",
+ #' which will read all time series in `cat_no`. Specify `tables` to
+ #' download and import specific table(s) - eg. `tables = 1` or `tables = c(1, 5)`.
+ #'
#' @param path Local directory in which to save downloaded ABS time series
#' spreadsheets. Default is "data/ABS"; this subdirectory of your working
#' directory will be created if it does not exist.
@@ -24,11 +28,12 @@
#'
#' \donttest{wpi <- read_abs("6345.0")}
#'
- #' @importFrom purrr walk walk2 map
+ #' @importFrom purrr walk walk2 map map2_dfr
#' @name read_abs
#' @export

read_abs <- function(cat_no = NULL,
tables = "all",
path = "data/ABS",
show_progress_bars = TRUE){

@@ -44,9 +49,14 @@ read_abs <- function(cat_no = NULL,
stop("Please ensure you include the cat_no extension, eg. '6202.0', not '6202'")
}

+ if(is.null(tables)){
+   message(paste0("`tables` not specified; attempting to fetch all tables from ", cat_no))
+ }

# find URLs from cat_no
message(paste0("Finding filenames for tables from ABS catalogue ", cat_no))
- xml_dfs <- get_abs_xml_metadata(cat_no = cat_no)
+ xml_dfs <- purrr::map2_dfr(cat_no, tables,
+                            .f = get_abs_xml_metadata)

urls <- unique(xml_dfs$TableURL)
urls <- gsub(".test", "", urls)
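read_abs() now assembles its metadata with purrr::map2_dfr(), which calls a function on pairs of inputs and row-binds the resulting data frames; a length-1 input (here, the single cat_no) is recycled against the vector of tables. A small sketch of that behaviour with a stand-in function (fake_metadata is illustrative only; the real call maps get_abs_xml_metadata):

library(purrr)

# stand-in for get_abs_xml_metadata(), returning a one-row data frame
fake_metadata <- function(cat_no, table) {
  data.frame(cat_no = cat_no, table = table, stringsAsFactors = FALSE)
}

map2_dfr("6202.0", c(1, 5), .f = fake_metadata)
#>   cat_no table
#> 1 6202.0     1
#> 2 6202.0     5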

