From 55c578adf5ef5af06138354e44be81954a7946ae Mon Sep 17 00:00:00 2001 From: Leonardo Alminana Date: Wed, 3 Apr 2024 17:35:07 +0200 Subject: [PATCH] file: fixed the content length inference approach Signed-off-by: Leonardo Alminana --- include/chunkio/cio_file.h | 1 + include/chunkio/cio_file_st.h | 65 ++++++++++++++++++++--------------- src/cio_file.c | 13 +++++-- 3 files changed, 49 insertions(+), 30 deletions(-) diff --git a/include/chunkio/cio_file.h b/include/chunkio/cio_file.h index 7d44749..3bc6591 100644 --- a/include/chunkio/cio_file.h +++ b/include/chunkio/cio_file.h @@ -44,6 +44,7 @@ struct cio_file { HANDLE backing_file; HANDLE backing_mapping; #endif + int taint_flag; /* content modification flag */ /* cached addr */ char *st_content; crc_t crc_cur; /* crc: current value calculated */ diff --git a/include/chunkio/cio_file_st.h b/include/chunkio/cio_file_st.h index 4b1552b..231cb58 100644 --- a/include/chunkio/cio_file_st.h +++ b/include/chunkio/cio_file_st.h @@ -112,17 +112,36 @@ static inline ssize_t cio_file_st_infer_content_len(char *map, size_t size) return content_length; } +/* Set content length */ +static inline void cio_file_st_set_content_len(char *map, uint32_t len) +{ + uint8_t *content_length_buffer; + + content_length_buffer = (uint8_t *) &map[CIO_FILE_CONTENT_LENGTH_OFFSET]; + + content_length_buffer[0] = (uint8_t) ((len & 0xFF000000) >> 24); + content_length_buffer[1] = (uint8_t) ((len & 0x00FF0000) >> 16); + content_length_buffer[2] = (uint8_t) ((len & 0x0000FF00) >> 8); + content_length_buffer[3] = (uint8_t) ((len & 0x000000FF) >> 0); +} + /* Get content length */ -static inline ssize_t cio_file_st_get_content_len(char *map, size_t size, - size_t page_size) +static inline ssize_t cio_file_st_get_content_len(char *map, + size_t size, + size_t page_size, + int tainted_data_flag) { uint8_t *content_length_buffer; + uint8_t *content_buffer; ssize_t content_length; + ssize_t content_offset; if (size < CIO_FILE_HEADER_MIN) { return -1; } + 
content_offset = CIO_FILE_CONTENT_OFFSET + 2 + cio_file_st_get_meta_len(map); + content_length_buffer = (uint8_t *) &map[CIO_FILE_CONTENT_LENGTH_OFFSET]; content_length = (ssize_t) (((uint32_t) content_length_buffer[0]) << 24) | @@ -134,37 +153,29 @@ static inline ssize_t cio_file_st_get_content_len(char *map, size_t size, * previous versions of chunkio that didn't include the content length * as part of the headers. * - * The reason why we need to ensure that the file size is larger than 4096 - * is that this is the minimal expected page size which is the unit used - * to initialize chunk files when they are created. - * - * In doing so, we effectively avoid returning bogus results when loading - * newly created, non trimmed files while at the same time retaining the - * capability of loading legacy files (that don't have a content size) - * that are larger than 4096 bytes. + * tainted_data_flag is used to differentiate non trimmed files being + * loaded from files whose chunk data is grown over the threshold and + * shrunk by the filter stack. * - * The only caveat is that trimmed files + * Because even when the content size is set to zero the data is not + * zeroed out (nor is the file shrunk) we can compare the first + * byte of the content section against zero to ensure that it's a + * valid msgpack serialized payload. 
*/ - if (content_length == 0 && - size > 0 && - size != page_size) { - content_length = cio_file_st_infer_content_len(map, size); - } - return content_length; -} + if (!tainted_data_flag && + content_length == 0 && + size > content_offset) { + content_buffer = (uint8_t *) &map[content_offset]; -/* Set content length */ -static inline void cio_file_st_set_content_len(char *map, uint32_t len) -{ - uint8_t *content_length_buffer; + if (content_buffer[0] != 0x00) { + content_length = cio_file_st_infer_content_len(map, size); - content_length_buffer = (uint8_t *) &map[CIO_FILE_CONTENT_LENGTH_OFFSET]; + cio_file_st_set_content_len(map, content_length); + } + } - content_length_buffer[0] = (uint8_t) ((len & 0xFF000000) >> 24); - content_length_buffer[1] = (uint8_t) ((len & 0x00FF0000) >> 16); - content_length_buffer[2] = (uint8_t) ((len & 0x0000FF00) >> 8); - content_length_buffer[3] = (uint8_t) ((len & 0x000000FF) >> 0); + return content_length; } #endif diff --git a/src/cio_file.c b/src/cio_file.c index 019baa8..a5285f8 100644 --- a/src/cio_file.c +++ b/src/cio_file.c @@ -80,13 +80,15 @@ void cio_file_calculate_checksum(struct cio_file *cf, crc_t *out) content_length = cio_file_st_get_content_len(cf->map, cf->fs_size, - cf->page_size); + cf->page_size, + cf->taint_flag); if (content_length > 0) { len += content_length; } in_data = (unsigned char *) cf->map + CIO_FILE_CONTENT_OFFSET; + val = cio_crc32_update(cf->crc_cur, in_data, len); *out = val; } @@ -236,7 +238,8 @@ static int cio_file_format_check(struct cio_chunk *ch, /* Expected / logical file size verification */ content_length = cio_file_st_get_content_len(cf->map, cf->fs_size, - cf->page_size); + cf->page_size, + cf->taint_flag); if (content_length == -1) { cio_log_debug(ch->ctx, "[cio file] truncated header (%zu / %zu) %s", @@ -420,7 +423,8 @@ static int mmap_file(struct cio_ctx *ctx, struct cio_chunk *ch, size_t size) if (fs_size > 0) { content_size = cio_file_st_get_content_len(cf->map, fs_size, - 
cf->page_size); + cf->page_size, + cf->taint_flag); if (content_size == -1) { cio_error_set(ch, CIO_ERR_BAD_FILE_SIZE); @@ -653,6 +657,7 @@ struct cio_file *cio_file_open(struct cio_ctx *ctx, cf->realloc_size = CIO_REALLOC_HINT_MIN; } + cf->taint_flag = CIO_FALSE; cf->st_content = NULL; cf->crc_cur = cio_crc32_init(); cf->path = path; @@ -1028,6 +1033,8 @@ int cio_file_write(struct cio_chunk *ch, const void *buf, size_t count) cio_file_st_set_content_len(cf->map, cf->data_size); + cf->taint_flag = CIO_TRUE; + return 0; }