Skip to content

Commit

Permalink
Optionally exclude blank cells (#28)
Browse files Browse the repository at this point in the history
For files that are too big to import because whole columns have been formatted
(but are mostly blank) #25
  • Loading branch information
nacnudus authored May 1, 2018
1 parent 3904829 commit 8c872e6
Show file tree
Hide file tree
Showing 13 changed files with 85 additions and 31 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
`-F10` (#26 @cablegui).
* No longer misinterprets date formats that use underscores `_` as dates when
the underscore is followed by a date-ish character like `M` (#24).
* Optionally omits blank cells with `include_blank_cells = FALSE` in
`xlsx_cells()` (#25).

# tidyxl 1.0.1

Expand Down
4 changes: 2 additions & 2 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

xlsx_cells_ <- function(path, sheet_paths, sheet_names, comments_paths) {
.Call('_tidyxl_xlsx_cells_', PACKAGE = 'tidyxl', path, sheet_paths, sheet_names, comments_paths)
xlsx_cells_ <- function(path, sheet_paths, sheet_names, comments_paths, include_blank_cells) {
.Call('_tidyxl_xlsx_cells_', PACKAGE = 'tidyxl', path, sheet_paths, sheet_names, comments_paths, include_blank_cells)
}

xlsx_formats_ <- function(path) {
Expand Down
2 changes: 1 addition & 1 deletion R/tidy_xlsx.R
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ tidy_xlsx <- function(path, sheets = NA) {
all_sheets <- utils_xlsx_sheet_files(path)
sheets <- check_sheets(sheets, path)
formats <- xlsx_formats_(path)
cells <- xlsx_cells_(path, sheets$sheet_path, sheets$name, sheets$comments_path)
cells <- xlsx_cells_(path, sheets$sheet_path, sheets$name, sheets$comments_path, include_blank_cells = TRUE)
# Split into a list of data frames, one per sheet
cells$sheet <- factor(cells$sheet, levels = sheets$name) # control sheet order
cells_list <- split(cells, cells$sheet)
Expand Down
10 changes: 8 additions & 2 deletions R/xlsx_cells.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
#' @param check_filetype Logical. Whether to check that the filetype is xlsx (or
#' xlsm) by looking at the file itself, rather than using the filename
#' extension.
#' @param include_blank_cells Logical. Whether to include cells that have no
#' value or formula (but might have formatting or comments). Useful when a
#' whole column of cells has been formatted, but most are empty. Try setting
#' this to `FALSE` if a spreadsheet seems too large to load.
#'
#' @return
#' A data frame with the following columns.
Expand Down Expand Up @@ -138,11 +142,13 @@
#' # In-cell formatting is available in the `character_formatted` column as a
#' # data frame, one row per substring.
#' xlsx_cells(examples)$character_formatted[77]
xlsx_cells <- function(path, sheets = NA, check_filetype = TRUE) {
xlsx_cells <- function(path, sheets = NA, check_filetype = TRUE,
include_blank_cells = TRUE) {
path <- check_file(path)
sheets <- check_sheets(sheets, path)
xlsx_cells_(path,
sheets$sheet_path,
sheets$name,
sheets$comments_path)
sheets$comments_path,
include_blank_cells)
}
8 changes: 7 additions & 1 deletion man/xlsx_cells.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 5 additions & 4 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,17 @@
using namespace Rcpp;

// xlsx_cells_
List xlsx_cells_(std::string path, CharacterVector sheet_paths, CharacterVector sheet_names, CharacterVector comments_paths);
RcppExport SEXP _tidyxl_xlsx_cells_(SEXP pathSEXP, SEXP sheet_pathsSEXP, SEXP sheet_namesSEXP, SEXP comments_pathsSEXP) {
List xlsx_cells_(std::string path, CharacterVector sheet_paths, CharacterVector sheet_names, CharacterVector comments_paths, bool include_blank_cells);
RcppExport SEXP _tidyxl_xlsx_cells_(SEXP pathSEXP, SEXP sheet_pathsSEXP, SEXP sheet_namesSEXP, SEXP comments_pathsSEXP, SEXP include_blank_cellsSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< std::string >::type path(pathSEXP);
Rcpp::traits::input_parameter< CharacterVector >::type sheet_paths(sheet_pathsSEXP);
Rcpp::traits::input_parameter< CharacterVector >::type sheet_names(sheet_namesSEXP);
Rcpp::traits::input_parameter< CharacterVector >::type comments_paths(comments_pathsSEXP);
rcpp_result_gen = Rcpp::wrap(xlsx_cells_(path, sheet_paths, sheet_names, comments_paths));
Rcpp::traits::input_parameter< bool >::type include_blank_cells(include_blank_cellsSEXP);
rcpp_result_gen = Rcpp::wrap(xlsx_cells_(path, sheet_paths, sheet_names, comments_paths, include_blank_cells));
return rcpp_result_gen;
END_RCPP
}
Expand Down Expand Up @@ -100,7 +101,7 @@ END_RCPP
}

static const R_CallMethodDef CallEntries[] = {
{"_tidyxl_xlsx_cells_", (DL_FUNC) &_tidyxl_xlsx_cells_, 4},
{"_tidyxl_xlsx_cells_", (DL_FUNC) &_tidyxl_xlsx_cells_, 5},
{"_tidyxl_xlsx_formats_", (DL_FUNC) &_tidyxl_xlsx_formats_, 1},
{"_tidyxl_xlsx_sheet_files_", (DL_FUNC) &_tidyxl_xlsx_sheet_files_, 1},
{"_tidyxl_xlsx_validation_", (DL_FUNC) &_tidyxl_xlsx_validation_, 3},
Expand Down
6 changes: 4 additions & 2 deletions src/tidyxl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,11 @@ List xlsx_cells_(
std::string path,
CharacterVector sheet_paths,
CharacterVector sheet_names,
CharacterVector comments_paths
CharacterVector comments_paths,
bool include_blank_cells
) {
xlsxbook book(path, sheet_paths, sheet_names, comments_paths);
xlsxbook book(path, sheet_paths, sheet_names, comments_paths,
include_blank_cells);
return book.information_;
}

Expand Down
9 changes: 5 additions & 4 deletions src/xlsxbook.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ xlsxbook::xlsxbook(
const std::string& path,
CharacterVector& sheet_paths,
CharacterVector& sheet_names,
CharacterVector& comments_paths):
CharacterVector& comments_paths,
const bool& include_blank_cells):
path_(path),
sheet_paths_(sheet_paths),
sheet_names_(sheet_names),
Expand All @@ -43,7 +44,7 @@ xlsxbook::xlsxbook(
createSheets();
countCells();
initializeColumns();
cacheInformation();
cacheInformation(include_blank_cells);
}

// Based on hadley/readxl
Expand Down Expand Up @@ -157,7 +158,7 @@ void xlsxbook::initializeColumns() {
local_format_id_ = IntegerVector(cellcount_, NA_INTEGER);
}

void xlsxbook::cacheInformation() {
void xlsxbook::cacheInformation(const bool& include_blank_cells) {
// Loop through sheets
List sheet_list(sheet_paths_.size());

Expand All @@ -175,7 +176,7 @@ void xlsxbook::cacheInformation() {
doc.parse<rapidxml::parse_strip_xml_namespaces>(&(*xml)[0]);
rapidxml::xml_node<>* workbook = doc.first_node("worksheet");
rapidxml::xml_node<>* sheetData = workbook->first_node("sheetData");
sheet->parseSheetData(sheetData, i);
sheet->parseSheetData(sheetData, i, include_blank_cells);
sheet->appendComments(i);
}

Expand Down
5 changes: 3 additions & 2 deletions src/xlsxbook.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ class xlsxbook {
const std::string& path,
Rcpp::CharacterVector& sheet_names,
Rcpp::CharacterVector& sheet_paths,
Rcpp::CharacterVector& comments_paths
Rcpp::CharacterVector& comments_paths,
const bool& include_blank_cells
);

void cacheStrings();
Expand All @@ -67,7 +68,7 @@ class xlsxbook {
void countCells();
void initializeColumns();
void cacheCells();
void cacheInformation();
void cacheInformation(const bool& include_blank_cells);

};

Expand Down
1 change: 1 addition & 0 deletions src/xlsxcell.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ void xlsxcell::cacheValue(
if (v != NULL) {
vvalue = v->value();
} else {
// TODO: don't construct the cell
book.is_blank_[i] = true;
}

Expand Down
49 changes: 37 additions & 12 deletions src/xlsxsheet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,8 @@ void xlsxsheet::cacheComments(String comments_path) {

void xlsxsheet::parseSheetData(
rapidxml::xml_node<>* sheetData,
unsigned long long int& i) {
unsigned long long int& i,
const bool& include_blank_cells) {
// Iterate through rows and cells in sheetData. Cell elements are children
// of row elements. Columns are described elsewhere in cols->col.
rowHeights_.assign(1048576, defaultRowHeight_); // cache rowHeight while here
Expand All @@ -160,19 +161,43 @@ void xlsxsheet::parseSheetData(
rowHeights_[rowNumber - 1] = rowHeight;
}

for (rapidxml::xml_node<>* c = row->first_node();
c; c = c->next_sibling()) {
xlsxcell cell(c, this, book_, i);
if (include_blank_cells) {
for (rapidxml::xml_node<>* c = row->first_node();
c; c = c->next_sibling()) {
xlsxcell cell(c, this, book_, i);

// Sheet name, row height and col width aren't really determined by the
// cell, so they're done in this sheet instance
book_.sheet_[i] = name_;
book_.height_[i] = rowHeight;
book_.width_[i] = colWidths_[book_.col_[i] - 1];
// Sheet name, row height and col width aren't really determined by
// the cell, so they're done in this sheet instance
book_.sheet_[i] = name_;
book_.height_[i] = rowHeight;
book_.width_[i] = colWidths_[book_.col_[i] - 1];

++i;
if ((i + 1) % 1000 == 0)
checkUserInterrupt();
++i;
if ((i + 1) % 1000 == 0)
checkUserInterrupt();
}
} else {
for (rapidxml::xml_node<>* c = row->first_node();
c; c = c->next_sibling()) {
// If cell has no child nodes then it is empty (no value or formula)
// besides maybe formatting (linked to via attributes not child nodes).
rapidxml::xml_node<>* first_child = c->first_node();
if (first_child != NULL) {
xlsxcell cell(c, this, book_, i);

// TODO: check readxl's method of importing ranges

// Sheet name, row height and col width aren't really determined by
// the cell, so they're done in this sheet instance
book_.sheet_[i] = name_;
book_.height_[i] = rowHeight;
book_.width_[i] = colWidths_[book_.col_[i] - 1];

++i;
if ((i + 1) % 1000 == 0)
checkUserInterrupt();
}
}
}
}
}
Expand Down
3 changes: 2 additions & 1 deletion src/xlsxsheet.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ class xlsxsheet {
void cacheComments(Rcpp::String comments_path);
void parseSheetData(
rapidxml::xml_node<>* sheetData,
unsigned long long int& i);
unsigned long long int& i,
const bool& include_blank_cells);
void appendComments(unsigned long long int& i);

};
Expand Down
8 changes: 8 additions & 0 deletions tests/testthat/test-xlsx_cells.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,11 @@ test_that("array formulas are detected as such", {
expect_equal(cells$is_array[43], TRUE)
expect_equal(cells$is_array[45], TRUE)
})

test_that("include_blank_cells works", {
  # With blank cells excluded, no returned row may be flagged as blank, while
  # the cells that do hold a value or formula must still be returned.
  cells <- xlsx_cells("./examples.xlsx", include_blank_cells = FALSE)
  n_blank <- nrow(cells[cells$is_blank, ])
  n_filled <- nrow(cells[!cells$is_blank, ])
  expect_equal(n_blank, 0L)
  expect_gt(n_filled, 0L)
})

0 comments on commit 8c872e6

Please sign in to comment.