From dab96a6eb159ffa34ff98f8dfc3b2a6862441e02 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Mon, 14 Oct 2024 20:30:21 +0200 Subject: [PATCH] Add more timing histogram and gauge metrics to the Neon extension (#9116) We now also track: - Number of PS IOs in-flight - Number of pages cached by smgr prefetch implementation - IO timing histograms for LFC reads and writes, per IO issued ## Problem There's little insight into the timing metrics of LFC, and what the prefetch state of each backend is. This changes that, by measuring (and subsequently exposing) these data points. ## Summary of changes - Extract IOHistogram as separate type, rather than a collection of fields on NeonMetrics - others, see items above. Part of https://github.com/neondatabase/neon/issues/8926 --- pgxn/neon/file_cache.c | 27 ++++- pgxn/neon/neon_perf_counters.c | 174 +++++++++++++++++++++------------ pgxn/neon/neon_perf_counters.h | 42 ++++++-- pgxn/neon/pagestore_smgr.c | 35 +++++++ 4 files changed, 205 insertions(+), 73 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index d789526050e8..bbea5a8b0d0c 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -43,6 +43,7 @@ #include "hll.h" #include "bitmap.h" #include "neon.h" +#include "neon_perf_counters.h" #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) @@ -114,7 +115,9 @@ typedef struct FileCacheControl uint32 limit; /* shared copy of lfc_size_limit */ uint64 hits; uint64 misses; - uint64 writes; + uint64 writes; /* number of writes issued */ + uint64 time_read; /* time spent reading (us) */ + uint64 time_write; /* time spent writing (us) */ dlist_head lru; /* double linked list for LRU replacement * algorithm */ dlist_head holes; /* double linked list of punched holes */ @@ -270,6 +273,8 @@ lfc_shmem_startup(void) lfc_ctl->hits = 0; lfc_ctl->misses = 0; lfc_ctl->writes = 0; + lfc_ctl->time_read = 0; + lfc_ctl->time_write = 0; dlist_init(&lfc_ctl->lru); dlist_init(&lfc_ctl->holes); @@ -701,6 +706,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); int iteration_hits = 0; int iteration_misses = 0; + uint64 io_time_us = 0; Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) @@ -795,6 +801,13 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, lfc_ctl->misses += iteration_misses; pgBufferUsage.file_cache.hits += iteration_hits; pgBufferUsage.file_cache.misses += iteration_misses; + + if (iteration_hits) + { + lfc_ctl->time_read += io_time_us; + inc_page_cache_read_wait(io_time_us); + } + CriticalAssert(entry->access_count > 0); if (--entry->access_count == 0) dlist_push_tail(&lfc_ctl->lru, &entry->list_node); @@ -859,6 +872,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, struct iovec iov[PG_IOV_MAX]; int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); + instr_time io_start, io_end; Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) @@ -947,12 +961,13 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, generation = lfc_ctl->generation; entry_offset = entry->offset; - lfc_ctl->writes += blocks_in_chunk; LWLockRelease(lfc_lock); pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE); + INSTR_TIME_SET_CURRENT(io_start); rc = pwritev(lfc_desc, iov, 
blocks_in_chunk, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + INSTR_TIME_SET_CURRENT(io_end); pgstat_report_wait_end(); if (rc != BLCKSZ * blocks_in_chunk) @@ -965,9 +980,17 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (lfc_ctl->generation == generation) { + uint64 time_spent_us; CriticalAssert(LFC_ENABLED()); /* Place entry to the head of LRU list */ CriticalAssert(entry->access_count > 0); + + lfc_ctl->writes += blocks_in_chunk; + INSTR_TIME_SUBTRACT(io_start, io_end); + time_spent_us = INSTR_TIME_GET_MICROSEC(io_start); + lfc_ctl->time_write += time_spent_us; + inc_page_cache_write_wait(time_spent_us); + if (--entry->access_count == 0) dlist_push_tail(&lfc_ctl->lru, &entry->list_node); diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index a497d387c851..05db18707609 100644 --- a/pgxn/neon/neon_perf_counters.c +++ b/pgxn/neon/neon_perf_counters.c @@ -50,28 +50,52 @@ NeonPerfCountersShmemInit(void) } } -/* - * Count a GetPage wait operation. - */ -void -inc_getpage_wait(uint64 latency_us) +static inline void +inc_iohist(IOHistogram hist, uint64 latency_us) { int lo = 0; - int hi = NUM_GETPAGE_WAIT_BUCKETS - 1; + int hi = NUM_IO_WAIT_BUCKETS - 1; /* Find the right bucket with binary search */ while (lo < hi) { int mid = (lo + hi) / 2; - if (latency_us < getpage_wait_bucket_thresholds[mid]) + if (latency_us < io_wait_bucket_thresholds[mid]) hi = mid; else lo = mid + 1; } - MyNeonCounters->getpage_wait_us_bucket[lo]++; - MyNeonCounters->getpage_wait_us_sum += latency_us; - MyNeonCounters->getpage_wait_us_count++; + hist->wait_us_bucket[lo]++; + hist->wait_us_sum += latency_us; + hist->wait_us_count++; +} + +/* + * Count a GetPage wait operation. + */ +void +inc_getpage_wait(uint64 latency) +{ + inc_iohist(&MyNeonCounters->getpage_hist, latency); +} + +/* + * Count an LFC read wait operation. + */ +void +inc_page_cache_read_wait(uint64 latency) +{ + inc_iohist(&MyNeonCounters->file_cache_read_hist, latency); +} + +/* + * Count an LFC write wait operation. 
+ */ +void +inc_page_cache_write_wait(uint64 latency) +{ + inc_iohist(&MyNeonCounters->file_cache_write_hist, latency); } /* @@ -81,77 +105,91 @@ inc_getpage_wait(uint64 latency_us) typedef struct { - char *name; + const char *name; bool is_bucket; double bucket_le; double value; } metric_t; -static metric_t * -neon_perf_counters_to_metrics(neon_per_backend_counters *counters) +static int +histogram_to_metrics(IOHistogram histogram, + metric_t *metrics, + const char *count, + const char *sum, + const char *bucket) { -#define NUM_METRICS (2 + NUM_GETPAGE_WAIT_BUCKETS + 8) - metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t)); - uint64 bucket_accum; - int i = 0; + int i = 0; + uint64 bucket_accum = 0; - metrics[i].name = "getpage_wait_seconds_count"; + metrics[i].name = count; metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_wait_us_count; + metrics[i].value = (double) histogram->wait_us_count; i++; - metrics[i].name = "getpage_wait_seconds_sum"; + metrics[i].name = sum; metrics[i].is_bucket = false; - metrics[i].value = ((double) counters->getpage_wait_us_sum) / 1000000.0; + metrics[i].value = (double) histogram->wait_us_sum / 1000000.0; i++; - - bucket_accum = 0; - for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++) + for (int bucketno = 0; bucketno < NUM_IO_WAIT_BUCKETS; bucketno++) { - uint64 threshold = getpage_wait_bucket_thresholds[bucketno]; + uint64 threshold = io_wait_bucket_thresholds[bucketno]; - bucket_accum += counters->getpage_wait_us_bucket[bucketno]; + bucket_accum += histogram->wait_us_bucket[bucketno]; - metrics[i].name = "getpage_wait_seconds_bucket"; + metrics[i].name = bucket; metrics[i].is_bucket = true; metrics[i].bucket_le = (threshold == UINT64_MAX) ? INFINITY : ((double) threshold) / 1000000.0; metrics[i].value = (double) bucket_accum; i++; } - metrics[i].name = "getpage_prefetch_requests_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_prefetch_requests_total; - i++; - metrics[i].name = "getpage_sync_requests_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_sync_requests_total; - i++; - metrics[i].name = "getpage_prefetch_misses_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_prefetch_misses_total; - i++; - metrics[i].name = "getpage_prefetch_discards_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_prefetch_discards_total; - i++; - metrics[i].name = "pageserver_requests_sent_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->pageserver_requests_sent_total; - i++; - metrics[i].name = "pageserver_disconnects_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->pageserver_disconnects_total; - i++; - metrics[i].name = "pageserver_send_flushes_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->pageserver_send_flushes_total; - i++; - metrics[i].name = "file_cache_hits_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->file_cache_hits_total; - i++; + + return i; +} + +static metric_t * +neon_perf_counters_to_metrics(neon_per_backend_counters *counters) +{ +#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 10) + metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t)); + int i = 0; + +#define APPEND_METRIC(_name) do { \ + metrics[i].name = #_name; \ + metrics[i].is_bucket = false; \ + metrics[i].value = (double) counters->_name; \ + 
i++; \ + } while (false) + + i += histogram_to_metrics(&counters->getpage_hist, &metrics[i], + "getpage_wait_seconds_count", + "getpage_wait_seconds_sum", + "getpage_wait_seconds_bucket"); + + APPEND_METRIC(getpage_prefetch_requests_total); + APPEND_METRIC(getpage_sync_requests_total); + APPEND_METRIC(getpage_prefetch_misses_total); + APPEND_METRIC(getpage_prefetch_discards_total); + APPEND_METRIC(pageserver_requests_sent_total); + APPEND_METRIC(pageserver_disconnects_total); + APPEND_METRIC(pageserver_send_flushes_total); + APPEND_METRIC(pageserver_open_requests); + APPEND_METRIC(getpage_prefetches_buffered); + + APPEND_METRIC(file_cache_hits_total); + + i += histogram_to_metrics(&counters->file_cache_read_hist, &metrics[i], + "file_cache_read_wait_seconds_count", + "file_cache_read_wait_seconds_sum", + "file_cache_read_wait_seconds_bucket"); + i += histogram_to_metrics(&counters->file_cache_write_hist, &metrics[i], + "file_cache_write_wait_seconds_count", + "file_cache_write_wait_seconds_sum", + "file_cache_write_wait_seconds_bucket"); Assert(i == NUM_METRICS); +#undef APPEND_METRIC +#undef NUM_METRICS + /* NULL entry marks end of array */ metrics[i].name = NULL; metrics[i].value = 0; @@ -216,6 +254,15 @@ neon_get_backend_perf_counters(PG_FUNCTION_ARGS) return (Datum) 0; } +static inline void +histogram_merge_into(IOHistogram into, IOHistogram from) +{ + into->wait_us_count += from->wait_us_count; + into->wait_us_sum += from->wait_us_sum; + for (int bucketno = 0; bucketno < NUM_IO_WAIT_BUCKETS; bucketno++) + into->wait_us_bucket[bucketno] += from->wait_us_bucket[bucketno]; +} + PG_FUNCTION_INFO_V1(neon_get_perf_counters); Datum neon_get_perf_counters(PG_FUNCTION_ARGS) @@ -234,10 +281,7 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) { neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno]; - totals.getpage_wait_us_count += counters->getpage_wait_us_count; - totals.getpage_wait_us_sum += counters->getpage_wait_us_sum; - for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++) - totals.getpage_wait_us_bucket[bucketno] += counters->getpage_wait_us_bucket[bucketno]; + histogram_merge_into(&totals.getpage_hist, &counters->getpage_hist); totals.getpage_prefetch_requests_total += counters->getpage_prefetch_requests_total; totals.getpage_sync_requests_total += counters->getpage_sync_requests_total; totals.getpage_prefetch_misses_total += counters->getpage_prefetch_misses_total; @@ -245,7 +289,11 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) totals.pageserver_requests_sent_total += counters->pageserver_requests_sent_total; totals.pageserver_disconnects_total += counters->pageserver_disconnects_total; totals.pageserver_send_flushes_total += counters->pageserver_send_flushes_total; + totals.pageserver_open_requests += counters->pageserver_open_requests; + totals.getpage_prefetches_buffered += counters->getpage_prefetches_buffered; totals.file_cache_hits_total += counters->file_cache_hits_total; + histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist); + histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist); } metrics = neon_perf_counters_to_metrics(&totals); diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h index 49d477c4f8ad..8edc658a3025 100644 --- a/pgxn/neon/neon_perf_counters.h +++ b/pgxn/neon/neon_perf_counters.h @@ -15,17 +15,26 @@ #include "storage/proc.h" #endif -static const uint64 getpage_wait_bucket_thresholds[] = { - 20, 30, 60, 100, /* 0 - 100 us */ +static const 
uint64 io_wait_bucket_thresholds[] = { + 2, 3, 6, 10, /* 0 us - 10 us */ + 20, 30, 60, 100, /* 10 us - 100 us */ 200, 300, 600, 1000, /* 100 us - 1 ms */ 2000, 3000, 6000, 10000, /* 1 ms - 10 ms */ 20000, 30000, 60000, 100000, /* 10 ms - 100 ms */ 200000, 300000, 600000, 1000000, /* 100 ms - 1 s */ 2000000, 3000000, 6000000, 10000000, /* 1 s - 10 s */ - 20000000, 30000000, 60000000, 100000000, /* 10 s - 100 s */ UINT64_MAX, }; -#define NUM_GETPAGE_WAIT_BUCKETS (lengthof(getpage_wait_bucket_thresholds)) +#define NUM_IO_WAIT_BUCKETS (lengthof(io_wait_bucket_thresholds)) + +typedef struct IOHistogramData +{ + uint64 wait_us_count; + uint64 wait_us_sum; + uint64 wait_us_bucket[NUM_IO_WAIT_BUCKETS]; +} IOHistogramData; + +typedef IOHistogramData *IOHistogram; typedef struct { @@ -39,9 +48,7 @@ typedef struct * the backend, but the 'neon_backend_perf_counters' view will convert * them to seconds, to make them more idiomatic as prometheus metrics. */ - uint64 getpage_wait_us_count; - uint64 getpage_wait_us_sum; - uint64 getpage_wait_us_bucket[NUM_GETPAGE_WAIT_BUCKETS]; + IOHistogramData getpage_hist; /* * Total number of speculative prefetch Getpage requests and synchronous @@ -50,7 +57,11 @@ typedef struct uint64 getpage_prefetch_requests_total; uint64 getpage_sync_requests_total; - /* XXX: It's not clear to me when these misses happen. */ + /* + * Total number of readahead misses; consisting of either prefetches that + * don't satisfy the LSN bounds, or cases where no readahead was issued + * for the read. + */ uint64 getpage_prefetch_misses_total; /* @@ -80,6 +91,16 @@ typedef struct * this can be smaller than pageserver_requests_sent_total. */ uint64 pageserver_send_flushes_total; + + /* + * Number of open requests to PageServer. + */ + uint64 pageserver_open_requests; + + /* + * Number of unused prefetches currently cached in this backend. + */ + uint64 getpage_prefetches_buffered; /* * Number of requests satisfied from the LFC. 
@@ -91,6 +112,9 @@ typedef struct */ uint64 file_cache_hits_total; + /* LFC I/O time buckets */ + IOHistogramData file_cache_read_hist; + IOHistogramData file_cache_write_hist; } neon_per_backend_counters; /* Pointer to the shared memory array of neon_per_backend_counters structs */ @@ -111,6 +135,8 @@ extern neon_per_backend_counters *neon_per_backend_counters_shared; #endif extern void inc_getpage_wait(uint64 latency); +extern void inc_page_cache_read_wait(uint64 latency); +extern void inc_page_cache_write_wait(uint64 latency); extern Size NeonPerfCountersShmemSize(void); extern void NeonPerfCountersShmemInit(void); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 3d9d9285dfc6..f46df7f70ac4 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -488,6 +488,11 @@ readahead_buffer_resize(int newsize, void *extra) newPState->n_unused -= 1; } + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + MyNeonCounters->pageserver_open_requests = + MyPState->n_requests_inflight; + for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1) { prefetch_set_unused(end); @@ -621,6 +626,8 @@ prefetch_read(PrefetchRequest *slot) MyPState->n_responses_buffered += 1; MyPState->n_requests_inflight -= 1; MyPState->ring_receive += 1; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; /* update slot state */ slot->status = PRFS_RECEIVED; @@ -674,6 +681,15 @@ prefetch_on_ps_disconnect(void) prefetch_set_unused(ring_index); } + + /* + * We can have gone into retry due to network error, so update stats with + * the latest available + */ + MyNeonCounters->pageserver_open_requests = + MyPState->n_requests_inflight; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; } /* @@ -706,6 +722,9 @@ prefetch_set_unused(uint64 ring_index) MyPState->n_responses_buffered -= 1; MyPState->n_unused += 1; + + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; } else { @@ -820,6 +839,15 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, hashkey.buftag = tag; Retry: + /* + * We can have gone into retry due to network error, so update stats with + * the latest available + */ + MyNeonCounters->pageserver_open_requests = + MyPState->ring_unused - MyPState->ring_receive; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + min_ring_index = UINT64_MAX; for (int i = 0; i < nblocks; i++) { @@ -1001,6 +1029,9 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, prefetch_do_request(slot, lsns); } + MyNeonCounters->pageserver_open_requests = + MyPState->ring_unused - MyPState->ring_receive; + Assert(any_hits); Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED || @@ -1076,8 +1107,10 @@ page_server_request(void const *req) { /* do nothing */ } + MyNeonCounters->pageserver_open_requests++; consume_prefetch_responses(); resp = page_server->receive(shard_no); + MyNeonCounters->pageserver_open_requests--; } PG_CATCH(); { @@ -1086,6 +1119,8 @@ page_server_request(void const *req) * point, but this currently seems fine for now. */ page_server->disconnect(shard_no); + MyNeonCounters->pageserver_open_requests = 0; + PG_RE_THROW(); } PG_END_TRY();
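
---

For reviewers unfamiliar with the histogram code path: below is a minimal standalone sketch of the bucket-selection logic that `inc_iohist()` in this patch uses, plus the cumulative-bucket output shape that `histogram_to_metrics()` produces. It is not the extension code itself — the threshold table is a trimmed copy of `io_wait_bucket_thresholds`, and the `histogram` struct here only mirrors the idea of `IOHistogramData`, not the shared-memory layout.

```c
/*
 * Standalone sketch: bucket a latency sample (in microseconds) with a
 * binary search over ascending thresholds, then print cumulative
 * ("le"-style) buckets as Prometheus histograms expect.
 */
#include <stdint.h>
#include <stdio.h>

static const uint64_t thresholds[] = {
	2, 3, 6, 10,			/* 0 us - 10 us */
	20, 30, 60, 100,		/* 10 us - 100 us */
	200, 300, 600, 1000,		/* 100 us - 1 ms */
	UINT64_MAX,			/* catch-all (+Inf) bucket */
};
#define NUM_BUCKETS (sizeof(thresholds) / sizeof(thresholds[0]))

typedef struct
{
	uint64_t	count;
	uint64_t	sum_us;
	uint64_t	bucket[NUM_BUCKETS];
} histogram;

static void
hist_add(histogram *hist, uint64_t latency_us)
{
	int			lo = 0;
	int			hi = NUM_BUCKETS - 1;

	/* Find the first bucket whose threshold exceeds the sample. */
	while (lo < hi)
	{
		int			mid = (lo + hi) / 2;

		if (latency_us < thresholds[mid])
			hi = mid;
		else
			lo = mid + 1;
	}
	hist->bucket[lo]++;
	hist->sum_us += latency_us;
	hist->count++;
}

int
main(void)
{
	histogram	h = {0};
	uint64_t	samples[] = {1, 15, 250, 999999};
	uint64_t	cumulative = 0;

	for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		hist_add(&h, samples[i]);

	/* Buckets are cumulative, matching histogram_to_metrics(). */
	for (size_t b = 0; b < NUM_BUCKETS; b++)
	{
		cumulative += h.bucket[b];
		if (thresholds[b] == UINT64_MAX)
			printf("le=+Inf: %llu\n", (unsigned long long) cumulative);
		else
			printf("le=%llu us: %llu\n",
				   (unsigned long long) thresholds[b],
				   (unsigned long long) cumulative);
	}
	printf("count=%llu sum_us=%llu\n",
		   (unsigned long long) h.count,
		   (unsigned long long) h.sum_us);
	return 0;
}
```

The per-backend counters in the patch are updated the same way, lock-free, because each backend only writes to its own `IOHistogramData`; `neon_get_perf_counters()` then sums the per-backend histograms with `histogram_merge_into()` before converting microsecond thresholds to seconds (and `UINT64_MAX` to `+Inf`) for the exported metrics.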