From 8f342268c662d8415a674f47b9f1f1e3fbc9c992 Mon Sep 17 00:00:00 2001 From: blankie Date: Sat, 27 Jan 2024 08:36:29 +1100 Subject: [PATCH 1/5] Fix possible null pointer write or memory corruption during CDX read --- src/warc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/warc.c b/src/warc.c index e025145b8..50822225b 100644 --- a/src/warc.c +++ b/src/warc.c @@ -1508,7 +1508,6 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url, bytes. */ idx_t checksum_l; char * checksum_v; - char *digest; base32_decode_alloc (checksum, strlen (checksum), &checksum_v, &checksum_l); xfree (checksum); @@ -1516,8 +1515,7 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url, if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE) { /* This is a valid line with a valid checksum. */ - memcpy (digest, checksum_v, SHA1_DIGEST_SIZE); - store_warc_record(original_url, NULL, record_id, digest); + store_warc_record(original_url, NULL, record_id, checksum_v); xfree (checksum_v); } else From cb35e761a1192706f059c4068e742662f92ade6f Mon Sep 17 00:00:00 2001 From: blankie Date: Sat, 27 Jan 2024 08:47:33 +1100 Subject: [PATCH 2/5] Fix null pointer read while parsing CDX file --- src/warc.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 90 insertions(+), 12 deletions(-) diff --git a/src/warc.c b/src/warc.c index 50822225b..311fe7b4c 100644 --- a/src/warc.c +++ b/src/warc.c @@ -850,6 +850,42 @@ warc_timestamp (char *timestamp, size_t timestamp_size) return timestamp; } +/* Checks if the timestamp passed is a valid CDX-style timestamp. */ +static bool +is_valid_cdx_timestamp (char *timestamp) +{ + for (size_t i = 0; i < 14; i++) + { + if (!c_isdigit (timestamp[i])) + { + return false; + } + } + + return timestamp[14] == 0; +} + +static char * +cdx_to_warc_timestamp (char *cdx_timestamp) +{ + char *warc_timestamp = xmalloc (21); + memcpy (warc_timestamp , cdx_timestamp , 4); /* YYYY */ + warc_timestamp[4] = '-'; /* - */ + memcpy (warc_timestamp + 5 , cdx_timestamp + 4 , 2); /* MM */ + warc_timestamp[7] = '-'; /* - */ + memcpy (warc_timestamp + 8 , cdx_timestamp + 6 , 2); /* DD */ + warc_timestamp[10] = 'T'; /* T */ + memcpy (warc_timestamp + 11, cdx_timestamp + 8 , 2); /* hh */ + warc_timestamp[13] = ':'; /* : */ + memcpy (warc_timestamp + 14, cdx_timestamp + 10, 2); /* mm */ + warc_timestamp[16] = ':'; /* : */ + memcpy (warc_timestamp + 17, cdx_timestamp + 12, 2); /* ss */ + warc_timestamp[19] = 'Z'; /* Z */ + warc_timestamp[20] = 0; + + return warc_timestamp; +} + /* Fills urn_str with a UUID in the format required for the WARC-Record-Id header. The string will be 47 characters long. */ @@ -1427,7 +1463,8 @@ store_warc_record (const char *uri, const char *date, const char *uuid, checksum and record ID fields. */ static bool warc_parse_cdx_header (char *lineptr, int *field_num_original_url, - int *field_num_checksum, int *field_num_record_id) + int *field_num_date, int *field_num_checksum, + int *field_num_record_id) { char *token; char *save_ptr; @@ -1451,6 +1488,9 @@ warc_parse_cdx_header (char *lineptr, int *field_num_original_url, case 'a': *field_num_original_url = field_num; break; + case 'b': + *field_num_date = field_num; + break; case 'k': *field_num_checksum = field_num; break; @@ -1464,16 +1504,19 @@ warc_parse_cdx_header (char *lineptr, int *field_num_original_url, } return *field_num_original_url != -1 + && *field_num_date != -1 && *field_num_checksum != -1 && *field_num_record_id != -1; } /* Parse the CDX record and add it to the warc_dedup_table hash table. */ -static void +static bool warc_process_cdx_line (char *lineptr, int field_num_original_url, - int field_num_checksum, int field_num_record_id) + int field_num_date, int field_num_checksum, + int field_num_record_id) { char *original_url = NULL; + char *date = NULL; char *checksum = NULL; char *record_id = NULL; char *token; @@ -1487,6 +1530,8 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url, char **val; if (field_num == field_num_original_url) val = &original_url; + else if (field_num == field_num_date) + val = &date; else if (field_num == field_num_checksum) val = &checksum; else if (field_num == field_num_record_id) @@ -1501,13 +1546,28 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url, field_num++; } - if (original_url != NULL && checksum != NULL && record_id != NULL) + if (original_url != NULL + && date != NULL + && checksum != NULL + && record_id != NULL) { + if (!is_valid_cdx_timestamp (date)) + { + logprintf (LOG_NOTQUIET, + _("CDX line contains invalid timestamp (%s).\n"), + quote (date)); + xfree (original_url); + xfree (date); + xfree (checksum); + xfree (record_id); + return false; + } + /* For some extra efficiency, we decode the base32 encoded checksum value. This should produce exactly SHA1_DIGEST_SIZE bytes. */ idx_t checksum_l; - char * checksum_v; + char *checksum_v, *warc_date; base32_decode_alloc (checksum, strlen (checksum), &checksum_v, &checksum_l); xfree (checksum); @@ -1515,12 +1575,15 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url, if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE) { /* This is a valid line with a valid checksum. */ - store_warc_record(original_url, NULL, record_id, checksum_v); + warc_date = cdx_to_warc_timestamp(date); + store_warc_record(original_url, warc_date, record_id, checksum_v); + xfree (date); xfree (checksum_v); } else { xfree (original_url); + xfree (date); xfree (checksum_v); xfree (record_id); } @@ -1529,8 +1592,11 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url, { xfree(checksum); xfree(original_url); + xfree(date); xfree(record_id); } + + return true; } /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills @@ -1543,6 +1609,7 @@ warc_load_cdx_dedup_file (void) size_t n = 0; ssize_t line_length; int field_num_original_url = -1; + int field_num_date = -1; int field_num_checksum = -1; int field_num_record_id = -1; @@ -1553,21 +1620,26 @@ warc_load_cdx_dedup_file (void) /* The first line should contain the CDX header. Format: " CDX x x x x x" where x are field type indicators. For our purposes, we only - need 'a' (the original url), 'k' (the SHA1 checksum) and - 'u' (the WARC record id). */ + need 'a' (the original url), 'b' (the date), + 'k' (the SHA1 checksum) and 'u' (the WARC record id). */ line_length = getline (&lineptr, &n, f); if (line_length != -1) warc_parse_cdx_header (lineptr, &field_num_original_url, - &field_num_checksum, &field_num_record_id); + &field_num_date, &field_num_checksum, + &field_num_record_id); - /* If the file contains all three fields, read the complete file. */ + /* If the file contains all four fields, read the complete file. */ if (field_num_original_url == -1 + || field_num_date == -1 || field_num_checksum == -1 || field_num_record_id == -1) { if (field_num_original_url == -1) logprintf (LOG_NOTQUIET, _("CDX file does not list original urls. (Missing column 'a'.)\n")); + if (field_num_date == -1) + logprintf (LOG_NOTQUIET, +_("CDX file does not list dates. (Missing column 'b'.)\n")); if (field_num_checksum == -1) logprintf (LOG_NOTQUIET, _("CDX file does not list checksums. (Missing column 'k'.)\n")); @@ -1586,8 +1658,14 @@ _("CDX file does not list record ids. (Missing column 'u'.)\n")); line_length = getline (&lineptr, &n, f); if (line_length != -1) { - warc_process_cdx_line (lineptr, field_num_original_url, - field_num_checksum, field_num_record_id); + if (!warc_process_cdx_line (lineptr, field_num_original_url, + field_num_date, field_num_checksum, + field_num_record_id)) + { + xfree (lineptr); + fclose (f); + return false; + } } } From 463b2f68439cbc7e8458476392aa40dc1c7b2186 Mon Sep 17 00:00:00 2001 From: blankie Date: Sat, 27 Jan 2024 08:49:45 +1100 Subject: [PATCH 3/5] Initialize WARC dedup table when CDX dedup file is being read --- src/warc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/warc.c b/src/warc.c index 311fe7b4c..5d5f24458 100644 --- a/src/warc.c +++ b/src/warc.c @@ -1260,10 +1260,6 @@ warc_start_new_file (bool meta) warc_current_file_number++; - /* init the hash table */ - warc_dedup_table = hash_table_new (1000, warc_hash_sha1_digest, - warc_cmp_sha1_digest); - base_filename_length = strlen (opt.warc_filename); /* filename format: base + "-" + 5 digit serial number + ".warc.zst" */ new_filename = xmalloc (base_filename_length + 1 + 5 + 9 + 1); @@ -1651,6 +1647,10 @@ _("CDX file does not list record ids. (Missing column 'u'.)\n")); { int nrecords; + /* init the hash table */ + warc_dedup_table = hash_table_new (1000, warc_hash_sha1_digest, + warc_cmp_sha1_digest); + /* Load CDX data into the table. */ do From e079345186c19901899064c86d601a59c85b07ae Mon Sep 17 00:00:00 2001 From: blankie Date: Mon, 29 Jan 2024 21:23:44 +1100 Subject: [PATCH 4/5] Fix memory leaks during CDX read --- src/warc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/warc.c b/src/warc.c index 5d5f24458..c265f4f4c 100644 --- a/src/warc.c +++ b/src/warc.c @@ -1573,8 +1573,11 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url, /* This is a valid line with a valid checksum. */ warc_date = cdx_to_warc_timestamp(date); store_warc_record(original_url, warc_date, record_id, checksum_v); + xfree (warc_date); + xfree (original_url); xfree (date); xfree (checksum_v); + xfree (record_id); } else { From 2ac77278319f5b4cf1b054cf02bc19d5c793ddc1 Mon Sep 17 00:00:00 2001 From: blankie Date: Mon, 19 Aug 2024 21:45:32 +1000 Subject: [PATCH 5/5] Initialize WARC dedup table if a WARC is created If this is not done, wget segfaults if --warc-dedup is not passed (but --warc-file and --warc-cdx is) --- src/warc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/warc.c b/src/warc.c index c265f4f4c..1b58dd70e 100644 --- a/src/warc.c +++ b/src/warc.c @@ -1650,10 +1650,6 @@ _("CDX file does not list record ids. (Missing column 'u'.)\n")); { int nrecords; - /* init the hash table */ - warc_dedup_table = hash_table_new (1000, warc_hash_sha1_digest, - warc_cmp_sha1_digest); - /* Load CDX data into the table. */ do @@ -1732,6 +1728,10 @@ warc_init (void) if (opt.warc_filename != NULL) { + /* init the hash table */ + warc_dedup_table = hash_table_new (1000, warc_hash_sha1_digest, + warc_cmp_sha1_digest); + if (opt.warc_cdx_dedup_filename != NULL) { if (! warc_load_cdx_dedup_file ())