diff --git a/NEWS.md b/NEWS.md index 4ddb742..cba74c4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,8 @@ `-F10` (#26 @cablegui). * No longer misinterprets date formats that use underscores `_` as dates when the underscore is followed by a date-ish character like `M` (#24). +* Optionally omits blank cells with `include_blank_cells = FALSE` in + `xlsx_cells()` (#25). # tidyxl 1.0.1 diff --git a/R/RcppExports.R b/R/RcppExports.R index 3eebe27..fbef21c 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,8 +1,8 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -xlsx_cells_ <- function(path, sheet_paths, sheet_names, comments_paths) { - .Call('_tidyxl_xlsx_cells_', PACKAGE = 'tidyxl', path, sheet_paths, sheet_names, comments_paths) +xlsx_cells_ <- function(path, sheet_paths, sheet_names, comments_paths, include_blank_cells) { + .Call('_tidyxl_xlsx_cells_', PACKAGE = 'tidyxl', path, sheet_paths, sheet_names, comments_paths, include_blank_cells) } xlsx_formats_ <- function(path) { diff --git a/R/tidy_xlsx.R b/R/tidy_xlsx.R index 651dc80..43b5454 100644 --- a/R/tidy_xlsx.R +++ b/R/tidy_xlsx.R @@ -193,7 +193,7 @@ tidy_xlsx <- function(path, sheets = NA) { all_sheets <- utils_xlsx_sheet_files(path) sheets <- check_sheets(sheets, path) formats <- xlsx_formats_(path) - cells <- xlsx_cells_(path, sheets$sheet_path, sheets$name, sheets$comments_path) + cells <- xlsx_cells_(path, sheets$sheet_path, sheets$name, sheets$comments_path, include_blank_cells = TRUE) # Split into a list of data frames, one per sheet cells$sheet <- factor(cells$sheet, levels = sheets$name) # control sheet order cells_list <- split(cells, cells$sheet) diff --git a/R/xlsx_cells.R b/R/xlsx_cells.R index f4ffca3..c0ec014 100644 --- a/R/xlsx_cells.R +++ b/R/xlsx_cells.R @@ -13,6 +13,10 @@ #' @param check_filetype Logical. Whether to check that the filetype is xlsx (or #' xlsm) by looking at the file itself, rather than using the filename #' extension. +#' @param include_blank_cells Logical. Whether to include cells that have no +#' value or formula (but might have formatting or comments). Useful when a +#' whole column of cells has been formatted, but most are empty. Try setting +#' this to `FALSE` if a spreadsheet seems too large to load. #' #' @return #' A data frame with the following columns. @@ -138,11 +142,13 @@ #' # In-cell formatting is available in the `character_formatted` column as a #' # data frame, one row per substring. #' xlsx_cells(examples)$character_formatted[77] -xlsx_cells <- function(path, sheets = NA, check_filetype = TRUE) { +xlsx_cells <- function(path, sheets = NA, check_filetype = TRUE, + include_blank_cells = TRUE) { path <- check_file(path) sheets <- check_sheets(sheets, path) xlsx_cells_(path, sheets$sheet_path, sheets$name, - sheets$comments_path) + sheets$comments_path, + include_blank_cells) } diff --git a/man/xlsx_cells.Rd b/man/xlsx_cells.Rd index 1aec41d..3336795 100644 --- a/man/xlsx_cells.Rd +++ b/man/xlsx_cells.Rd @@ -4,7 +4,8 @@ \alias{xlsx_cells} \title{Import xlsx (Excel) cell contents into a tidy structure.} \usage{ -xlsx_cells(path, sheets = NA, check_filetype = TRUE) +xlsx_cells(path, sheets = NA, check_filetype = TRUE, + include_blank_cells = TRUE) } \arguments{ \item{path}{Path to the xlsx file.} @@ -16,6 +17,11 @@ sheets).} \item{check_filetype}{Logical. Whether to check that the filetype is xlsx (or xlsm) by looking at the file itself, rather than using the filename extension.} + +\item{include_blank_cells}{Logical. Whether to include cells that have no +value or formula (but might have formatting or comments). Useful when a +whole column of cells has been formatted, but most are empty. Try setting +this to \code{FALSE} if a spreadsheet seems too large to load.} } \value{ A data frame with the following columns. diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 62f1fbe..3000fd0 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -6,8 +6,8 @@ using namespace Rcpp; // xlsx_cells_ -List xlsx_cells_(std::string path, CharacterVector sheet_paths, CharacterVector sheet_names, CharacterVector comments_paths); -RcppExport SEXP _tidyxl_xlsx_cells_(SEXP pathSEXP, SEXP sheet_pathsSEXP, SEXP sheet_namesSEXP, SEXP comments_pathsSEXP) { +List xlsx_cells_(std::string path, CharacterVector sheet_paths, CharacterVector sheet_names, CharacterVector comments_paths, bool include_blank_cells); +RcppExport SEXP _tidyxl_xlsx_cells_(SEXP pathSEXP, SEXP sheet_pathsSEXP, SEXP sheet_namesSEXP, SEXP comments_pathsSEXP, SEXP include_blank_cellsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -15,7 +15,8 @@ BEGIN_RCPP Rcpp::traits::input_parameter< CharacterVector >::type sheet_paths(sheet_pathsSEXP); Rcpp::traits::input_parameter< CharacterVector >::type sheet_names(sheet_namesSEXP); Rcpp::traits::input_parameter< CharacterVector >::type comments_paths(comments_pathsSEXP); - rcpp_result_gen = Rcpp::wrap(xlsx_cells_(path, sheet_paths, sheet_names, comments_paths)); + Rcpp::traits::input_parameter< bool >::type include_blank_cells(include_blank_cellsSEXP); + rcpp_result_gen = Rcpp::wrap(xlsx_cells_(path, sheet_paths, sheet_names, comments_paths, include_blank_cells)); return rcpp_result_gen; END_RCPP } @@ -100,7 +101,7 @@ END_RCPP } static const R_CallMethodDef CallEntries[] = { - {"_tidyxl_xlsx_cells_", (DL_FUNC) &_tidyxl_xlsx_cells_, 4}, + {"_tidyxl_xlsx_cells_", (DL_FUNC) &_tidyxl_xlsx_cells_, 5}, {"_tidyxl_xlsx_formats_", (DL_FUNC) &_tidyxl_xlsx_formats_, 1}, {"_tidyxl_xlsx_sheet_files_", (DL_FUNC) &_tidyxl_xlsx_sheet_files_, 1}, {"_tidyxl_xlsx_validation_", (DL_FUNC) &_tidyxl_xlsx_validation_, 3}, diff --git a/src/tidyxl.cpp b/src/tidyxl.cpp index 5d19352..c95c974 100644 --- a/src/tidyxl.cpp +++ b/src/tidyxl.cpp @@ -21,9 +21,11 @@ List xlsx_cells_( std::string path, CharacterVector sheet_paths, CharacterVector sheet_names, - CharacterVector comments_paths + CharacterVector comments_paths, + bool include_blank_cells ) { - xlsxbook book(path, sheet_paths, sheet_names, comments_paths); + xlsxbook book(path, sheet_paths, sheet_names, comments_paths, + include_blank_cells); return book.information_; } diff --git a/src/xlsxbook.cpp b/src/xlsxbook.cpp index 8832bee..31134f9 100644 --- a/src/xlsxbook.cpp +++ b/src/xlsxbook.cpp @@ -24,7 +24,8 @@ xlsxbook::xlsxbook( const std::string& path, CharacterVector& sheet_paths, CharacterVector& sheet_names, - CharacterVector& comments_paths): + CharacterVector& comments_paths, + const bool& include_blank_cells): path_(path), sheet_paths_(sheet_paths), sheet_names_(sheet_names), @@ -43,7 +44,7 @@ xlsxbook::xlsxbook( createSheets(); countCells(); initializeColumns(); - cacheInformation(); + cacheInformation(include_blank_cells); } // Based on hadley/readxl @@ -157,7 +158,7 @@ void xlsxbook::initializeColumns() { local_format_id_ = IntegerVector(cellcount_, NA_INTEGER); } -void xlsxbook::cacheInformation() { +void xlsxbook::cacheInformation(const bool& include_blank_cells) { // Loop through sheets List sheet_list(sheet_paths_.size()); @@ -175,7 +176,7 @@ void xlsxbook::cacheInformation() { doc.parse(&(*xml)[0]); rapidxml::xml_node<>* workbook = doc.first_node("worksheet"); rapidxml::xml_node<>* sheetData = workbook->first_node("sheetData"); - sheet->parseSheetData(sheetData, i); + sheet->parseSheetData(sheetData, i, include_blank_cells); sheet->appendComments(i); } diff --git a/src/xlsxbook.h b/src/xlsxbook.h index f89ef00..224e4c7 100644 --- a/src/xlsxbook.h +++ b/src/xlsxbook.h @@ -57,7 +57,8 @@ class xlsxbook { const std::string& path, Rcpp::CharacterVector& sheet_names, Rcpp::CharacterVector& sheet_paths, - Rcpp::CharacterVector& comments_paths + Rcpp::CharacterVector& comments_paths, + const bool& include_blank_cells ); void cacheStrings(); @@ -67,7 +68,7 @@ class xlsxbook { void countCells(); void initializeColumns(); void cacheCells(); - void cacheInformation(); + void cacheInformation(const bool& include_blank_cells); }; diff --git a/src/xlsxcell.cpp b/src/xlsxcell.cpp index 96ee002..c287334 100644 --- a/src/xlsxcell.cpp +++ b/src/xlsxcell.cpp @@ -68,6 +68,7 @@ void xlsxcell::cacheValue( if (v != NULL) { vvalue = v->value(); } else { + // TODO: don't construct the cell book.is_blank_[i] = true; } diff --git a/src/xlsxsheet.cpp b/src/xlsxsheet.cpp index a25879e..e7c70b1 100644 --- a/src/xlsxsheet.cpp +++ b/src/xlsxsheet.cpp @@ -141,7 +141,8 @@ void xlsxsheet::cacheComments(String comments_path) { void xlsxsheet::parseSheetData( rapidxml::xml_node<>* sheetData, - unsigned long long int& i) { + unsigned long long int& i, + const bool& include_blank_cells) { // Iterate through rows and cells in sheetData. Cell elements are children // of row elements. Columns are described elswhere in cols->col. rowHeights_.assign(1048576, defaultRowHeight_); // cache rowHeight while here @@ -160,19 +161,43 @@ void xlsxsheet::parseSheetData( rowHeights_[rowNumber - 1] = rowHeight; } - for (rapidxml::xml_node<>* c = row->first_node(); - c; c = c->next_sibling()) { - xlsxcell cell(c, this, book_, i); + if (include_blank_cells) { + for (rapidxml::xml_node<>* c = row->first_node(); + c; c = c->next_sibling()) { + xlsxcell cell(c, this, book_, i); - // Sheet name, row height and col width aren't really determined by the - // cell, so they're done in this sheet instance - book_.sheet_[i] = name_; - book_.height_[i] = rowHeight; - book_.width_[i] = colWidths_[book_.col_[i] - 1]; + // Sheet name, row height and col width aren't really determined by + // the cell, so they're done in this sheet instance + book_.sheet_[i] = name_; + book_.height_[i] = rowHeight; + book_.width_[i] = colWidths_[book_.col_[i] - 1]; - ++i; - if ((i + 1) % 1000 == 0) - checkUserInterrupt(); + ++i; + if ((i + 1) % 1000 == 0) + checkUserInterrupt(); + } + } else { + for (rapidxml::xml_node<>* c = row->first_node(); + c; c = c->next_sibling()) { + // If cell has no child nodes then it is empty (no value or formula) + // besides maybe formatting (linked to via attributes not child nodes). + rapidxml::xml_node<>* first_child = c->first_node(); + if (first_child != NULL) { + xlsxcell cell(c, this, book_, i); + + // TODO: check readxl's method of importing ranges + + // Sheet name, row height and col width aren't really determined by + // the cell, so they're done in this sheet instance + book_.sheet_[i] = name_; + book_.height_[i] = rowHeight; + book_.width_[i] = colWidths_[book_.col_[i] - 1]; + + ++i; + if ((i + 1) % 1000 == 0) + checkUserInterrupt(); + } + } } } } diff --git a/src/xlsxsheet.h b/src/xlsxsheet.h index 867ee6b..8cad0fc 100644 --- a/src/xlsxsheet.h +++ b/src/xlsxsheet.h @@ -35,7 +35,8 @@ class xlsxsheet { void cacheComments(Rcpp::String comments_path); void parseSheetData( rapidxml::xml_node<>* sheetData, - unsigned long long int& i); + unsigned long long int& i, + const bool& include_blank_cells); void appendComments(unsigned long long int& i); }; diff --git a/tests/testthat/test-xlsx_cells.R b/tests/testthat/test-xlsx_cells.R index 44bd337..26c9649 100644 --- a/tests/testthat/test-xlsx_cells.R +++ b/tests/testthat/test-xlsx_cells.R @@ -39,3 +39,11 @@ test_that("array formulas are detected as such", { expect_equal(cells$is_array[43], TRUE) expect_equal(cells$is_array[45], TRUE) }) + +test_that("include_blank_cells works", { + cells <- xlsx_cells("./examples.xlsx", include_blank_cells = FALSE) + blanks <- cells[cells$is_blank, ] + non_blanks <- cells[!cells$is_blank, ] + expect_equal(nrow(blanks), 0L) + expect_gt(nrow(non_blanks), 0L) +})