Skip to content

Commit 222e11a

Browse files
committed
Use incremental parsing of backup manifests.
This changes the three callers to json_parse_manifest() to use json_parse_manifest_incremental_chunk() if appropriate. In the case of the backend caller, since we don't know the size of the manifest in advance we always call the incremental parser. Author: Andrew Dunstan Reviewed-By: Jacob Champion Discussion: https://postgr.es/m/7b0a51d6-0d9d-7366-3a1a-f74397a02f55@dunslane.net
1 parent ea7b4e9 commit 222e11a

File tree

3 files changed

+178
-62
lines changed

3 files changed

+178
-62
lines changed

src/backend/backup/basebackup_incremental.c

+42-16
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,14 @@
3333

3434
#define BLOCKS_PER_READ 512
3535

36+
/*
37+
* We expect to find the last lines of the manifest, including the checksum,
38+
* in the last MIN_CHUNK bytes of the manifest. We trigger an incremental
39+
* parse step if we are about to overflow MAX_CHUNK bytes.
40+
*/
41+
#define MIN_CHUNK 1024
42+
#define MAX_CHUNK (128 * 1024)
43+
3644
/*
3745
* Details extracted from the WAL ranges present in the supplied backup manifest.
3846
*/
@@ -112,6 +120,11 @@ struct IncrementalBackupInfo
112120
* turns out to be a problem in practice, we'll need to be more clever.
113121
*/
114122
BlockRefTable *brtab;
123+
124+
/*
125+
* State object for incremental JSON parsing
126+
*/
127+
JsonManifestParseIncrementalState *inc_state;
115128
};
116129

117130
static void manifest_process_version(JsonManifestParseContext *context,
@@ -142,6 +155,7 @@ CreateIncrementalBackupInfo(MemoryContext mcxt)
142155
{
143156
IncrementalBackupInfo *ib;
144157
MemoryContext oldcontext;
158+
JsonManifestParseContext *context;
145159

146160
oldcontext = MemoryContextSwitchTo(mcxt);
147161

@@ -157,6 +171,17 @@ CreateIncrementalBackupInfo(MemoryContext mcxt)
157171
*/
158172
ib->manifest_files = backup_file_create(mcxt, 10000, NULL);
159173

174+
context = palloc0(sizeof(JsonManifestParseContext));
175+
/* Parse the manifest. */
176+
context->private_data = ib;
177+
context->version_cb = manifest_process_version;
178+
context->system_identifier_cb = manifest_process_system_identifier;
179+
context->per_file_cb = manifest_process_file;
180+
context->per_wal_range_cb = manifest_process_wal_range;
181+
context->error_cb = manifest_report_error;
182+
183+
ib->inc_state = json_parse_manifest_incremental_init(context);
184+
160185
MemoryContextSwitchTo(oldcontext);
161186

162187
return ib;
@@ -176,13 +201,20 @@ AppendIncrementalManifestData(IncrementalBackupInfo *ib, const char *data,
176201
/* Switch to our memory context. */
177202
oldcontext = MemoryContextSwitchTo(ib->mcxt);
178203

179-
/*
180-
* XXX. Our json parser is at present incapable of parsing json blobs
181-
* incrementally, so we have to accumulate the entire backup manifest
182-
* before we can do anything with it. This should really be fixed, since
183-
* some users might have very large numbers of files in the data
184-
* directory.
185-
*/
204+
if (ib->buf.len > MIN_CHUNK && ib->buf.len + len > MAX_CHUNK)
205+
{
206+
/*
207+
* Time for an incremental parse. We'll do all but the last MIN_CHUNK
208+
* so that we have enough left for the final piece.
209+
*/
210+
json_parse_manifest_incremental_chunk(
211+
ib->inc_state, ib->buf.data, ib->buf.len - MIN_CHUNK, false);
212+
/* now remove what we just parsed */
213+
memmove(ib->buf.data, ib->buf.data + (ib->buf.len - MIN_CHUNK),
214+
MIN_CHUNK + 1);
215+
ib->buf.len = MIN_CHUNK;
216+
}
217+
186218
appendBinaryStringInfo(&ib->buf, data, len);
187219

188220
/* Switch back to previous memory context. */
@@ -196,20 +228,14 @@ AppendIncrementalManifestData(IncrementalBackupInfo *ib, const char *data,
196228
void
197229
FinalizeIncrementalManifest(IncrementalBackupInfo *ib)
198230
{
199-
JsonManifestParseContext context;
200231
MemoryContext oldcontext;
201232

202233
/* Switch to our memory context. */
203234
oldcontext = MemoryContextSwitchTo(ib->mcxt);
204235

205-
/* Parse the manifest. */
206-
context.private_data = ib;
207-
context.version_cb = manifest_process_version;
208-
context.system_identifier_cb = manifest_process_system_identifier;
209-
context.per_file_cb = manifest_process_file;
210-
context.per_wal_range_cb = manifest_process_wal_range;
211-
context.error_cb = manifest_report_error;
212-
json_parse_manifest(&context, ib->buf.data, ib->buf.len);
236+
/* Parse the last chunk of the manifest */
237+
json_parse_manifest_incremental_chunk(
238+
ib->inc_state, ib->buf.data, ib->buf.len, true);
213239

214240
/* Done with the buffer, so release memory. */
215241
pfree(ib->buf.data);

src/bin/pg_combinebackup/load_manifest.c

+70-22
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,12 @@
3434
*/
3535
#define ESTIMATED_BYTES_PER_MANIFEST_LINE 100
3636

37+
/*
38+
* Size of the JSON chunk to be read in at a time.
39+
*
40+
*/
41+
#define READ_CHUNK_SIZE (128 * 1024)
42+
3743
/*
3844
* Define a hash table which we can use to store information about the files
3945
* mentioned in the backup manifest.
@@ -109,6 +115,7 @@ load_backup_manifest(char *backup_directory)
109115
int rc;
110116
JsonManifestParseContext context;
111117
manifest_data *result;
118+
int chunk_size = READ_CHUNK_SIZE;
112119

113120
/* Open the manifest file. */
114121
snprintf(pathname, MAXPGPATH, "%s/backup_manifest", backup_directory);
@@ -133,27 +140,6 @@ load_backup_manifest(char *backup_directory)
133140
/* Create the hash table. */
134141
ht = manifest_files_create(initial_size, NULL);
135142

136-
/*
137-
* Slurp in the whole file.
138-
*
139-
* This is not ideal, but there's currently no way to get pg_parse_json()
140-
* to perform incremental parsing.
141-
*/
142-
buffer = pg_malloc(statbuf.st_size);
143-
rc = read(fd, buffer, statbuf.st_size);
144-
if (rc != statbuf.st_size)
145-
{
146-
if (rc < 0)
147-
pg_fatal("could not read file \"%s\": %m", pathname);
148-
else
149-
pg_fatal("could not read file \"%s\": read %d of %lld",
150-
pathname, rc, (long long int) statbuf.st_size);
151-
}
152-
153-
/* Close the manifest file. */
154-
close(fd);
155-
156-
/* Parse the manifest. */
157143
result = pg_malloc0(sizeof(manifest_data));
158144
result->files = ht;
159145
context.private_data = result;
@@ -162,7 +148,69 @@ load_backup_manifest(char *backup_directory)
162148
context.per_file_cb = combinebackup_per_file_cb;
163149
context.per_wal_range_cb = combinebackup_per_wal_range_cb;
164150
context.error_cb = report_manifest_error;
165-
json_parse_manifest(&context, buffer, statbuf.st_size);
151+
152+
/*
153+
* Parse the file, in chunks if necessary.
154+
*/
155+
if (statbuf.st_size <= chunk_size)
156+
{
157+
buffer = pg_malloc(statbuf.st_size);
158+
rc = read(fd, buffer, statbuf.st_size);
159+
if (rc != statbuf.st_size)
160+
{
161+
if (rc < 0)
162+
pg_fatal("could not read file \"%s\": %m", pathname);
163+
else
164+
pg_fatal("could not read file \"%s\": read %d of %lld",
165+
pathname, rc, (long long int) statbuf.st_size);
166+
}
167+
168+
/* Close the manifest file. */
169+
close(fd);
170+
171+
/* Parse the manifest. */
172+
json_parse_manifest(&context, buffer, statbuf.st_size);
173+
}
174+
else
175+
{
176+
int bytes_left = statbuf.st_size;
177+
JsonManifestParseIncrementalState *inc_state;
178+
179+
inc_state = json_parse_manifest_incremental_init(&context);
180+
181+
buffer = pg_malloc(chunk_size + 1);
182+
183+
while (bytes_left > 0)
184+
{
185+
int bytes_to_read = chunk_size;
186+
187+
/*
188+
* Make sure that the last chunk is sufficiently large. (i.e. at
189+
* least half the chunk size) so that it will contain fully the
190+
* piece at the end with the checksum.
191+
*/
192+
if (bytes_left < chunk_size)
193+
bytes_to_read = bytes_left;
194+
else if (bytes_left < 2 * chunk_size)
195+
bytes_to_read = bytes_left / 2;
196+
rc = read(fd, buffer, bytes_to_read);
197+
if (rc != bytes_to_read)
198+
{
199+
if (rc < 0)
200+
pg_fatal("could not read file \"%s\": %m", pathname);
201+
else
202+
pg_fatal("could not read file \"%s\": read %lld of %lld",
203+
pathname,
204+
(long long int) (statbuf.st_size + rc - bytes_left),
205+
(long long int) statbuf.st_size);
206+
}
207+
bytes_left -= rc;
208+
json_parse_manifest_incremental_chunk(
209+
inc_state, buffer, rc, bytes_left == 0);
210+
}
211+
212+
close(fd);
213+
}
166214

167215
/* All done. */
168216
pfree(buffer);

src/bin/pg_verifybackup/pg_verifybackup.c

+66-24
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
/*
4444
* How many bytes should we try to read from a file at once?
4545
*/
46-
#define READ_CHUNK_SIZE 4096
46+
#define READ_CHUNK_SIZE (128 * 1024)
4747

4848
/*
4949
* Each file described by the manifest file is parsed to produce an object
@@ -399,6 +399,8 @@ parse_manifest_file(char *manifest_path)
399399
JsonManifestParseContext context;
400400
manifest_data *result;
401401

402+
int chunk_size = READ_CHUNK_SIZE;
403+
402404
/* Open the manifest file. */
403405
if ((fd = open(manifest_path, O_RDONLY | PG_BINARY, 0)) < 0)
404406
report_fatal_error("could not open file \"%s\": %m", manifest_path);
@@ -414,28 +416,6 @@ parse_manifest_file(char *manifest_path)
414416
/* Create the hash table. */
415417
ht = manifest_files_create(initial_size, NULL);
416418

417-
/*
418-
* Slurp in the whole file.
419-
*
420-
* This is not ideal, but there's currently no easy way to get
421-
* pg_parse_json() to perform incremental parsing.
422-
*/
423-
buffer = pg_malloc(statbuf.st_size);
424-
rc = read(fd, buffer, statbuf.st_size);
425-
if (rc != statbuf.st_size)
426-
{
427-
if (rc < 0)
428-
report_fatal_error("could not read file \"%s\": %m",
429-
manifest_path);
430-
else
431-
report_fatal_error("could not read file \"%s\": read %d of %lld",
432-
manifest_path, rc, (long long int) statbuf.st_size);
433-
}
434-
435-
/* Close the manifest file. */
436-
close(fd);
437-
438-
/* Parse the manifest. */
439419
result = pg_malloc0(sizeof(manifest_data));
440420
result->files = ht;
441421
context.private_data = result;
@@ -444,7 +424,69 @@ parse_manifest_file(char *manifest_path)
444424
context.per_file_cb = verifybackup_per_file_cb;
445425
context.per_wal_range_cb = verifybackup_per_wal_range_cb;
446426
context.error_cb = report_manifest_error;
447-
json_parse_manifest(&context, buffer, statbuf.st_size);
427+
428+
/*
429+
* Parse the file, in chunks if necessary.
430+
*/
431+
if (statbuf.st_size <= chunk_size)
432+
{
433+
buffer = pg_malloc(statbuf.st_size);
434+
rc = read(fd, buffer, statbuf.st_size);
435+
if (rc != statbuf.st_size)
436+
{
437+
if (rc < 0)
438+
pg_fatal("could not read file \"%s\": %m", manifest_path);
439+
else
440+
pg_fatal("could not read file \"%s\": read %d of %lld",
441+
manifest_path, rc, (long long int) statbuf.st_size);
442+
}
443+
444+
/* Close the manifest file. */
445+
close(fd);
446+
447+
/* Parse the manifest. */
448+
json_parse_manifest(&context, buffer, statbuf.st_size);
449+
}
450+
else
451+
{
452+
int bytes_left = statbuf.st_size;
453+
JsonManifestParseIncrementalState *inc_state;
454+
455+
inc_state = json_parse_manifest_incremental_init(&context);
456+
457+
buffer = pg_malloc(chunk_size + 1);
458+
459+
while (bytes_left > 0)
460+
{
461+
int bytes_to_read = chunk_size;
462+
463+
/*
464+
* Make sure that the last chunk is sufficiently large. (i.e. at
465+
* least half the chunk size) so that it will contain fully the
466+
* piece at the end with the checksum.
467+
*/
468+
if (bytes_left < chunk_size)
469+
bytes_to_read = bytes_left;
470+
else if (bytes_left < 2 * chunk_size)
471+
bytes_to_read = bytes_left / 2;
472+
rc = read(fd, buffer, bytes_to_read);
473+
if (rc != bytes_to_read)
474+
{
475+
if (rc < 0)
476+
pg_fatal("could not read file \"%s\": %m", manifest_path);
477+
else
478+
pg_fatal("could not read file \"%s\": read %lld of %lld",
479+
manifest_path,
480+
(long long int) (statbuf.st_size + rc - bytes_left),
481+
(long long int) statbuf.st_size);
482+
}
483+
bytes_left -= rc;
484+
json_parse_manifest_incremental_chunk(
485+
inc_state, buffer, rc, bytes_left == 0);
486+
}
487+
488+
close(fd);
489+
}
448490

449491
/* Done with the buffer. */
450492
pfree(buffer);

0 commit comments

Comments
 (0)