From f7d2485c9a52600125abf428f85b61dac016b28e Mon Sep 17 00:00:00 2001 From: DavZim Date: Sun, 25 Aug 2024 11:45:14 +0200 Subject: [PATCH] fix read for large files, should solve #30 --- DESCRIPTION | 2 +- NEWS.md | 4 ++++ debug/debug_tools.cpp | 12 ++++++------ src/count_messages.cpp | 15 +++++++++++---- src/filter_itch.cpp | 15 +++++++++++---- src/read_functions.cpp | 15 +++++++++++---- 6 files changed, 44 insertions(+), 19 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 693f0f4..45996d8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RITCH Type: Package Title: R Parser for the ITCH-Protocol -Version: 0.1.26 +Version: 0.1.27 Authors@R: c( person("David", "Zimmermann-Kollenda", , "david_j_zimmermann@hotmail.com", role = c("aut", "cre")) ) diff --git a/NEWS.md b/NEWS.md index b816651..52382bc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# RITCH 0.1.76 + +* fix bug where no messages would be reported for larger files + # RITCH 0.1.26 * fix bug where gz functionality would write to user library or current directory diff --git a/debug/debug_tools.cpp b/debug/debug_tools.cpp index dad8ea5..6894458 100644 --- a/debug/debug_tools.cpp +++ b/debug/debug_tools.cpp @@ -118,7 +118,7 @@ void dbg_itch_file(std::string filename = "inst/extdata/ex20101224.TEST_ITCH_50" Rprintf("Number of Messages:\n"); for (int j = 0; j < N_ACT_MSGS; j++) { - Rprintf("- '%c': %i\n", ACT_MSG_NAMES[j], counts[j]); + Rprintf("- '%c': %ld\n", ACT_MSG_NAMES[j], counts[j]); } Rprintf("=============================\n"); // Use the Buffer @@ -154,7 +154,7 @@ void dbg_itch_file(std::string filename = "inst/extdata/ex20101224.TEST_ITCH_50" } } - Rprintf("'%c' (len 2 + %i) idx %4i at offset %5i (0x%04x) | ", num, l - 2, i, idx, idx); + Rprintf("'%c' (len 2 + %i) idx %4i at offset %5ld (0x%04lx) | ", num, l - 2, i, idx, idx); Rprintf("(%02x %02x) ", bufferPtr[idx], bufferPtr[idx + 1]); for (int x = 2; x < l; x++) Rprintf("%02x ", bufferPtr[idx + x]); Rprintf("\n"); @@ -237,14 +237,14 @@ dbg_hex_to_dbl <- function(h, prec = 4) { */ // converts a std::string of hex values to a buffer -char * to_buffer(std::string x) { +unsigned char * to_buffer(std::string x) { x.erase(remove_if(x.begin(), x.end(), isspace), x.end()); const uint64_t n_bytes = x.size() / 2; unsigned char * buf; // Rprintf("Found %u bytes\n", x.size() / 2); buf = (unsigned char*) calloc(x.size() / 2, sizeof(unsigned char)); - for (int j = 0; j < n_bytes; j++) + for (uint64_t j = 0; j < n_bytes; j++) buf[j] = std::stoul(x.substr(j * 2, 2), nullptr, 16); return buf; } @@ -263,7 +263,7 @@ Rcpp::DataFrame hex_count_messages_impl(std::string x) { std::vector count = count_messages_buffer(buf, n_bytes); Rcpp::StringVector types; - for (unsigned char c : ACT_MSG_NAMES) types.push_back(c); + for (unsigned char c : ACT_MSG_NAMES) types.push_back(std::string(1, c)); Rcpp::List df(2); df.names() = Rcpp::CharacterVector::create("msg_type", "count"); @@ -322,7 +322,7 @@ Rcpp::DataFrame dbg_hex_to_df(std::string x, std::string msg_class) { MessageParser mp(msg_class, 0, 100); // take max 100 messages... mp.activate(); mp.init_vectors(n_messages + 100); - int64_t i = 2; + uint64_t i = 2; while (i < n_bytes) { mp.parse_message(&buf[i]); diff --git a/src/count_messages.cpp b/src/count_messages.cpp index 04d85d5..54b3e0f 100644 --- a/src/count_messages.cpp +++ b/src/count_messages.cpp @@ -12,9 +12,16 @@ std::vector count_messages_internal(std::string filename, } // get size of the file - fseek(infile, 0L, SEEK_END); - int64_t filesize = ftell(infile); - fseek(infile, 0L, SEEK_SET); + if (fseeko64(infile, 0L, SEEK_END) != 0) { + Rcpp::stop("Error seeking to end of file"); + } + int64_t filesize = ftello64(infile); + if (filesize == -1) { + Rcpp::stop("Error getting file size"); + } + if (fseeko64(infile, 0L, SEEK_SET) != 0) { + Rcpp::stop("Error seeking back to start of file"); + } // create buffer int64_t buf_size = max_buffer_size > filesize ? filesize : max_buffer_size; @@ -42,7 +49,7 @@ std::vector count_messages_internal(std::string filename, // align the file pointer to read in a full message again const int64_t offset = i - this_buffer_size; - fseek(infile, offset, SEEK_CUR); + fseeko64(infile, offset, SEEK_CUR); bytes_read += i; } diff --git a/src/filter_itch.cpp b/src/filter_itch.cpp index dc94059..c892230 100644 --- a/src/filter_itch.cpp +++ b/src/filter_itch.cpp @@ -64,9 +64,16 @@ void filter_itch_impl(std::string infile, std::string outfile, } // get size of the file - fseek(ifile, 0L, SEEK_END); - int64_t filesize = ftell(ifile); - fseek(ifile, 0L, SEEK_SET); + if (fseeko64(ifile, 0L, SEEK_END) != 0) { + Rcpp::stop("Error seeking to end of file"); + } + int64_t filesize = ftello64(ifile); + if (filesize == -1) { + Rcpp::stop("Error getting file size"); + } + if (fseeko64(ifile, 0L, SEEK_SET) != 0) { + Rcpp::stop("Error seeking back to start of file"); + } // create buffer int64_t buf_size = max_buffer_size > filesize ? filesize : max_buffer_size; @@ -153,7 +160,7 @@ void filter_itch_impl(std::string infile, std::string outfile, const int64_t offset = i - this_buffer_size; // Rprintf("Filter ibuf at %6lld offsetting by %3lld - Total bytes read %lld\n", // i, offset, bytes_read + i); - fseek(ifile, offset, SEEK_CUR); + fseeko64(ifile, offset, SEEK_CUR); bytes_read += i; } diff --git a/src/read_functions.cpp b/src/read_functions.cpp index fa26e26..dd9767f 100644 --- a/src/read_functions.cpp +++ b/src/read_functions.cpp @@ -99,9 +99,16 @@ Rcpp::List read_itch_impl(std::vector classes, } // get size of the file - fseek(infile, 0L, SEEK_END); - int64_t filesize = ftell(infile); - fseek(infile, 0L, SEEK_SET); + if (fseeko64(infile, 0L, SEEK_END) != 0) { + Rcpp::stop("Error seeking to end of file"); + } + int64_t filesize = ftello64(infile); + if (filesize == -1) { + Rcpp::stop("Error getting file size"); + } + if (fseeko64(infile, 0L, SEEK_SET) != 0) { + Rcpp::stop("Error seeking back to start of file"); + } // create buffer int64_t buf_size = max_buffer_size > filesize ? filesize : max_buffer_size; @@ -161,7 +168,7 @@ Rcpp::List read_itch_impl(std::vector classes, // offset file pointer to fit the next message into the buffer const int64_t offset = i - this_buffer_size; - fseek(infile, offset, SEEK_CUR); + fseeko64(infile, offset, SEEK_CUR); bytes_read += i; }