-
Notifications
You must be signed in to change notification settings - Fork 0
/
gist_private.h
563 lines (477 loc) · 19.8 KB
/
gist_private.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
/*-------------------------------------------------------------------------
*
* gist_private.h
* private declarations for GiST -- declarations related to the
* internal implementation of GiST, not the public API
*
* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/access/gist_private.h
*
*-------------------------------------------------------------------------
*/
#ifndef GIST_PRIVATE_H
#define GIST_PRIVATE_H
#include "access/amapi.h"
#include "gist.h"
#include "access/itup.h"
#include "fmgr.h"
#include "lib/pairingheap.h"
#include "storage/bufmgr.h"
#include "storage/buffile.h"
#include "utils/hsearch.h"
#include "access/genam.h"
/*
* Maximum number of "halves" a page can be split into in one operation.
* Typically a split produces 2 halves, but can be more if keys have very
* different lengths, or when inserting multiple keys in one operation (as
* when inserting downlinks to an internal node). There is no theoretical
* limit on this, but in practice if you get more than a handful page halves
* in one split, there's something wrong with the opclass implementation.
* GIST_MAX_SPLIT_PAGES is an arbitrary limit on that, used to size some
* local arrays used during split. Note that there is also a limit on the
* number of buffers that can be held locked at a time, MAX_SIMUL_LWLOCKS,
* so if you raise this higher than that limit, you'll just get a different
* error.
*/
#define GIST_MAX_SPLIT_PAGES 75
#define GIST_SKIPGROUP_THRESHOLD 16
/* Buffer lock modes */
#define GIST_SHARE BUFFER_LOCK_SHARE
#define GIST_EXCLUSIVE BUFFER_LOCK_EXCLUSIVE
#define GIST_UNLOCK BUFFER_LOCK_UNLOCK
typedef struct
{
BlockNumber prev;
uint32 freespace;
char tupledata[FLEXIBLE_ARRAY_MEMBER];
} GISTNodeBufferPage;
#define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(GISTNodeBufferPage, tupledata))
/* Returns free space in node buffer page */
#define PAGE_FREE_SPACE(nbp) (nbp->freespace)
/* Checks if node buffer page is empty */
#define PAGE_IS_EMPTY(nbp) (nbp->freespace == BLCKSZ - BUFFER_PAGE_DATA_OFFSET)
/* Checks if node buffers page don't contain sufficient space for index tuple */
#define PAGE_NO_SPACE(nbp, itup) (PAGE_FREE_SPACE(nbp) < \
MAXALIGN(IndexTupleSize(itup)))
/*
* GISTSTATE: information needed for any GiST index operation
*
* This struct retains call info for the index's opclass-specific support
* functions (per index column), plus the index's tuple descriptor.
*
* scanCxt holds the GISTSTATE itself as well as any data that lives for the
* lifetime of the index operation. We pass this to the support functions
* via fn_mcxt, so that they can store scan-lifespan data in it. The
* functions are invoked in tempCxt, which is typically short-lifespan
* (that is, it's reset after each tuple). However, tempCxt can be the same
* as scanCxt if we're not bothering with per-tuple context resets.
*/
typedef struct GISTSTATE
{
MemoryContext scanCxt; /* context for scan-lifespan data */
MemoryContext tempCxt; /* short-term context for calling functions */
TupleDesc leafTupdesc; /* index's tuple descriptor */
TupleDesc nonLeafTupdesc; /* truncated tuple descriptor for non-leaf
* pages */
TupleDesc fetchTupdesc; /* tuple descriptor for tuples returned in an
* index-only scan */
FmgrInfo consistentFn[INDEX_MAX_KEYS];
FmgrInfo unionFn[INDEX_MAX_KEYS];
FmgrInfo compressFn[INDEX_MAX_KEYS];
FmgrInfo decompressFn[INDEX_MAX_KEYS];
FmgrInfo penaltyFn[INDEX_MAX_KEYS];
FmgrInfo picksplitFn[INDEX_MAX_KEYS];
FmgrInfo equalFn[INDEX_MAX_KEYS];
FmgrInfo distanceFn[INDEX_MAX_KEYS];
FmgrInfo fetchFn[INDEX_MAX_KEYS];
/* Collations to pass to the support functions */
Oid supportCollation[INDEX_MAX_KEYS];
} GISTSTATE;
/*
* During a GiST index search, we must maintain a queue of unvisited items,
* which can be either individual heap tuples or whole index pages. If it
* is an ordered search, the unvisited items should be visited in distance
* order. Unvisited items at the same distance should be visited in
* depth-first order, that is heap items first, then lower index pages, then
* upper index pages; this rule avoids doing extra work during a search that
* ends early due to LIMIT.
*
* To perform an ordered search, we use a pairing heap to manage the
* distance-order queue. In a non-ordered search (no order-by operators),
* we use it to return heap tuples before unvisited index pages, to
* ensure depth-first order, but all entries are otherwise considered
* equal.
*/
/* Individual heap tuple to be visited */
typedef struct GISTSearchHeapItem
{
ItemPointerData heapPtr;
bool recheck; /* T if quals must be rechecked */
bool recheckDistances; /* T if distances must be rechecked */
HeapTuple recontup; /* data reconstructed from the index, used in
* index-only scans */
OffsetNumber offnum; /* track offset in page to mark tuple as
* LP_DEAD */
} GISTSearchHeapItem;
/* Unvisited item, either index page or heap tuple */
typedef struct GISTSearchItem
{
pairingheap_node phNode;
BlockNumber blkno; /* index page number, or InvalidBlockNumber */
union
{
GistNSN parentlsn; /* parent page's LSN, if index page */
/* we must store parentlsn to detect whether a split occurred */
GISTSearchHeapItem heap; /* heap info, if heap tuple */
} data;
double distances[FLEXIBLE_ARRAY_MEMBER]; /* numberOfOrderBys
* entries */
} GISTSearchItem;
#define GISTSearchItemIsHeap(item) ((item).blkno == InvalidBlockNumber)
#define SizeOfGISTSearchItem(n_distances) (offsetof(GISTSearchItem, distances) + sizeof(double) * (n_distances))
/*
* GISTScanOpaqueData: private state for a scan of a GiST index
*/
typedef struct GISTScanOpaqueData
{
GISTSTATE *giststate; /* index information, see above */
Oid *orderByTypes; /* datatypes of ORDER BY expressions */
pairingheap *queue; /* queue of unvisited items */
MemoryContext queueCxt; /* context holding the queue */
bool qual_ok; /* false if qual can never be satisfied */
bool firstCall; /* true until first gistgettuple call */
/* pre-allocated workspace arrays */
double *distances; /* output area for gistindex_keytest */
/* info about killed items if any (killedItems is NULL if never used) */
OffsetNumber *killedItems; /* offset numbers of killed items */
int numKilled; /* number of currently stored items */
BlockNumber curBlkno; /* current number of block */
GistNSN curPageLSN; /* pos in the WAL stream when page was read */
/* In a non-ordered search, returnable heap items are stored here: */
GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)];
OffsetNumber nPageData; /* number of valid items in array */
OffsetNumber curPageData; /* next item to return */
MemoryContext pageDataCxt; /* context holding the fetched tuples, for
* index-only scans */
} GISTScanOpaqueData;
typedef GISTScanOpaqueData *GISTScanOpaque;
/* despite the name, gistxlogPage is not part of any xlog record */
typedef struct gistxlogPage
{
BlockNumber blkno;
int num; /* number of index tuples following */
} gistxlogPage;
/* SplitedPageLayout - gistSplit function result */
typedef struct SplitedPageLayout
{
gistxlogPage block;
IndexTupleData *list;
int lenlist;
IndexTuple itup; /* union key for page */
Page page; /* to operate */
Buffer buffer; /* to write after all proceed */
struct SplitedPageLayout *next;
} SplitedPageLayout;
/*
* GISTInsertStack used for locking buffers and transfer arguments during
* insertion
*/
typedef struct GISTInsertStack
{
/* current page */
BlockNumber blkno;
Buffer buffer;
Page page;
/*
* log sequence number from page->lsn to recognize page update and compare
* it with page's nsn to recognize page split
*/
GistNSN lsn;
/* offset of the downlink in the parent page, that points to this page */
OffsetNumber downlinkoffnum;
OffsetNumber skipoffnum;
/* pointer to parent */
struct GISTInsertStack *parent;
} GISTInsertStack;
/* Working state and results for multi-column split logic in gistsplit.c */
typedef struct GistSplitVector
{
GIST_SPLITVEC splitVector; /* passed to/from user PickSplit method */
Datum spl_lattr[INDEX_MAX_KEYS]; /* Union of subkeys in
* splitVector.spl_left */
bool spl_lisnull[INDEX_MAX_KEYS];
Datum spl_rattr[INDEX_MAX_KEYS]; /* Union of subkeys in
* splitVector.spl_right */
bool spl_risnull[INDEX_MAX_KEYS];
bool *spl_dontcare; /* flags tuples which could go to either side
* of the split for zero penalty */
} GistSplitVector;
typedef struct
{
Relation r;
Relation heapRel;
Size freespace; /* free space to be left */
bool is_build;
GISTInsertStack *stack;
} GISTInsertState;
/* root page of a gist index */
#define GIST_ROOT_BLKNO 0
/*
* Before PostgreSQL 9.1, we used to rely on so-called "invalid tuples" on
* inner pages to finish crash recovery of incomplete page splits. If a crash
* happened in the middle of a page split, so that the downlink pointers were
* not yet inserted, crash recovery inserted a special downlink pointer. The
* semantics of an invalid tuple was that it if you encounter one in a scan,
* it must always be followed, because we don't know if the tuples on the
* child page match or not.
*
* We no longer create such invalid tuples, we now mark the left-half of such
* an incomplete split with the F_FOLLOW_RIGHT flag instead, and finish the
* split properly the next time we need to insert on that page. To retain
* on-disk compatibility for the sake of pg_upgrade, we still store 0xffff as
* the offset number of all inner tuples. If we encounter any invalid tuples
* with 0xfffe during insertion, we throw an error, though scans still handle
* them. You should only encounter invalid tuples if you pg_upgrade a pre-9.1
* gist index which already has invalid tuples in it because of a crash. That
* should be rare, and you are recommended to REINDEX anyway if you have any
* invalid tuples in an index, so throwing an error is as far as we go with
* supporting that.
*/
#define TUPLE_IS_VALID 0xffff
#define TUPLE_IS_INVALID 0xfffe
#define GistTupleIsInvalid(itup) ( ItemPointerGetOffsetNumber( &((itup)->t_tid) ) == TUPLE_IS_INVALID )
#define GistTupleSetValid(itup) ItemPointerSetOffsetNumber( &((itup)->t_tid), TUPLE_IS_VALID )
#define INDEX_SKIP_MASK 0x2000
#define GistTupleIsSkip(itup) ((((IndexTuple) (itup))->t_info & INDEX_SKIP_MASK))
#define GistTupleMakeSkip(itup) ((((IndexTuple) (itup))->t_info |= INDEX_SKIP_MASK))
#define GistTupleSetSkipCount(itup, skipTupleCnt) ((((IndexTuple) (itup))->t_tid.ip_posid = skipTupleCnt))
#define GistTupleGetSkipCount(itup) ((((IndexTuple) (itup))->t_tid.ip_posid))
/*
* A buffer attached to an internal node, used when building an index in
* buffering mode.
*/
typedef struct
{
BlockNumber nodeBlocknum; /* index block # this buffer is for */
int32 blocksCount; /* current # of blocks occupied by buffer */
BlockNumber pageBlocknum; /* temporary file block # */
GISTNodeBufferPage *pageBuffer; /* in-memory buffer page */
/* is this buffer queued for emptying? */
bool queuedForEmptying;
/* is this a temporary copy, not in the hash table? */
bool isTemp;
int level; /* 0 == leaf */
} GISTNodeBuffer;
/*
* Does specified level have buffers? (Beware of multiple evaluation of
* arguments.)
*/
#define LEVEL_HAS_BUFFERS(nlevel, gfbb) \
((nlevel) != 0 && (nlevel) % (gfbb)->levelStep == 0 && \
(nlevel) != (gfbb)->rootlevel)
/* Is specified buffer at least half-filled (should be queued for emptying)? */
#define BUFFER_HALF_FILLED(nodeBuffer, gfbb) \
((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer / 2)
/*
* Is specified buffer full? Our buffers can actually grow indefinitely,
* beyond the "maximum" size, so this just means whether the buffer has grown
* beyond the nominal maximum size.
*/
#define BUFFER_OVERFLOWED(nodeBuffer, gfbb) \
((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer)
/*
* Data structure with general information about build buffers.
*/
typedef struct GISTBuildBuffers
{
/* Persistent memory context for the buffers and metadata. */
MemoryContext context;
BufFile *pfile; /* Temporary file to store buffers in */
long nFileBlocks; /* Current size of the temporary file */
/*
* resizable array of free blocks.
*/
long *freeBlocks;
int nFreeBlocks; /* # of currently free blocks in the array */
int freeBlocksLen; /* current allocated length of the array */
/* Hash for buffers by block number */
HTAB *nodeBuffersTab;
/* List of buffers scheduled for emptying */
List *bufferEmptyingQueue;
/*
* Parameters to the buffering build algorithm. levelStep determines which
* levels in the tree have buffers, and pagesPerBuffer determines how
* large each buffer is.
*/
int levelStep;
int pagesPerBuffer;
/* Array of lists of buffers on each level, for final emptying */
List **buffersOnLevels;
int buffersOnLevelsLen;
/*
* Dynamically-sized array of buffers that currently have their last page
* loaded in main memory.
*/
GISTNodeBuffer **loadedBuffers;
int loadedBuffersCount; /* # of entries in loadedBuffers */
int loadedBuffersLen; /* allocated size of loadedBuffers */
/* Level of the current root node (= height of the index tree - 1) */
int rootlevel;
} GISTBuildBuffers;
/*
* Storage type for GiST's reloptions
*/
typedef struct GiSTOptions
{
int32 vl_len_; /* varlena header (do not touch directly!) */
int fillfactor; /* page fill factor in percent (0..100) */
int bufferingModeOffset; /* use buffering build? */
} GiSTOptions;
/* gist.c */
extern void gistbuildempty(Relation index);
extern bool gistinsert(Relation r, Datum *values, bool *isnull,
ItemPointer ht_ctid, Relation heapRel,
IndexUniqueCheck checkUnique,
struct IndexInfo *indexInfo);
extern MemoryContext createTempGistContext(void);
extern GISTSTATE *initGISTstate(Relation index);
extern void freeGISTstate(GISTSTATE *giststate);
extern void gistdoinsert(Relation r,
IndexTuple itup,
Size freespace,
GISTSTATE *GISTstate,
Relation heapRel,
bool is_build);
/* A List of these is returned from gistplacetopage() in *splitinfo */
typedef struct
{
Buffer buf; /* the split page "half" */
IndexTuple downlink; /* downlink for this half. */
} GISTPageSplitInfo;
extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
Buffer buffer,
IndexTuple *itup, int ntup,
OffsetNumber oldoffnum, BlockNumber *newblkno,
Buffer leftchildbuf,
List **splitinfo,
bool markleftchild,
Relation heapRel,
bool is_build,
int ndeltup,
OffsetNumber skipoffnum);
extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
int len, GISTSTATE *giststate);
/* gistxlog.c */
extern XLogRecPtr gistXLogPageDelete(Buffer buffer,
TransactionId xid, Buffer parentBuffer,
OffsetNumber downlinkOffset);
extern void gistXLogPageReuse(Relation rel, BlockNumber blkno,
TransactionId latestRemovedXid);
extern SplitedPageLayout *gistSplitBySkipgroup(Relation r, Page page, IndexTuple *itup,
int len, GISTSTATE *giststate);
extern XLogRecPtr gistXLogUpdate(Buffer buffer,
OffsetNumber *todelete, int ntodelete,
IndexTuple *itup, int ntup,
Buffer leftchild, OffsetNumber skipoffnum);
extern XLogRecPtr gistXLogDelete(Buffer buffer, OffsetNumber *todelete,
int ntodelete, RelFileNode hnode);
extern XLogRecPtr gistXLogSplit(bool page_is_leaf,
SplitedPageLayout *dist,
BlockNumber origrlink, GistNSN oldnsn,
Buffer leftchild, bool markfollowright);
/* gistget.c */
extern bool gistgettuple(IndexScanDesc scan, ScanDirection dir);
extern int64 gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
extern bool gistcanreturn(Relation index, int attno);
/* gistvalidate.c */
extern bool gistvalidate(Oid opclassoid);
/* gistutil.c */
#define GiSTPageSize \
( BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(GISTPageOpaqueData)) )
#define GIST_MIN_FILLFACTOR 10
#define GIST_DEFAULT_FILLFACTOR 90
extern bytea *gistoptions(Datum reloptions, bool validate);
extern bool gistproperty(Oid index_oid, int attno,
IndexAMProperty prop, const char *propname,
bool *res, bool *isnull);
extern bool gistfitpage(IndexTuple *itvec, int len);
extern bool gistnospace(Page page, IndexTuple *itvec, int len, OffsetNumber todelete, Size freespace, int ndeltup);
extern void gistcheckpage(Relation rel, Buffer buf);
extern Buffer gistNewBuffer(Relation r);
extern bool gistPageRecyclable(Page page);
extern void gistfillbuffer(Page page, IndexTuple *itup, int len,
OffsetNumber off);
extern IndexTuple *gistextractpage(Page page, int *len /* out */);
extern IndexTuple *gistextractrange(Page page, OffsetNumber start, int len);
extern IndexTuple *gistjoinvector(
IndexTuple *itvec, int *len,
IndexTuple *additvec, int addlen, OffsetNumber oldoffnum);
extern void gistfiltervector(IndexTuple *itvec, int *len);
extern inline void gistcheckskippage(Page page);
extern IndexTupleData *gistfillitupvec(IndexTuple *vec, int veclen, int *memlen);
extern IndexTuple gistunion(Relation r, IndexTuple *itvec,
int len, GISTSTATE *giststate);
extern IndexTuple gistgetadjusted(Relation r,
IndexTuple oldtup,
IndexTuple addtup,
GISTSTATE *giststate);
extern IndexTuple gistFormTuple(GISTSTATE *giststate,
Relation r, Datum *attdata, bool *isnull, bool isleaf);
extern OffsetNumber gistchoose(Relation r, Page p,
IndexTuple it,
GISTSTATE *giststate,
OffsetNumber *skipoffnum);
extern void GISTInitBuffer(Buffer b, uint32 f);
extern void gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e,
Datum k, Relation r, Page pg, OffsetNumber o,
bool l, bool isNull);
extern float gistpenalty(GISTSTATE *giststate, int attno,
GISTENTRY *key1, bool isNull1,
GISTENTRY *key2, bool isNull2);
extern void gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len,
Datum *attr, bool *isnull);
extern bool gistKeyIsEQ(GISTSTATE *giststate, int attno, Datum a, Datum b);
extern void gistDeCompressAtt(GISTSTATE *giststate, Relation r, IndexTuple tuple, Page p,
OffsetNumber o, GISTENTRY *attdata, bool *isnull);
extern HeapTuple gistFetchTuple(GISTSTATE *giststate, Relation r,
IndexTuple tuple);
extern void gistMakeUnionKey(GISTSTATE *giststate, int attno,
GISTENTRY *entry1, bool isnull1,
GISTENTRY *entry2, bool isnull2,
Datum *dst, bool *dstisnull);
extern XLogRecPtr gistGetFakeLSN(Relation rel);
/* gistvacuum.c */
extern IndexBulkDeleteResult *gistbulkdelete(IndexVacuumInfo *info,
IndexBulkDeleteResult *stats,
IndexBulkDeleteCallback callback,
void *callback_state);
extern IndexBulkDeleteResult *gistvacuumcleanup(IndexVacuumInfo *info,
IndexBulkDeleteResult *stats);
/* gistsplit.c */
extern void gistSplitByKey(Relation r, Page page, IndexTuple *itup,
int len, GISTSTATE *giststate,
GistSplitVector *v,
int attno);
/* gistbuild.c */
extern IndexBuildResult *gistbuild(Relation heap, Relation index,
struct IndexInfo *indexInfo);
extern void gistValidateBufferingOption(const char *value);
/* gistbuildbuffers.c */
extern GISTBuildBuffers *gistInitBuildBuffers(int pagesPerBuffer, int levelStep,
int maxLevel);
extern GISTNodeBuffer *gistGetNodeBuffer(GISTBuildBuffers *gfbb,
GISTSTATE *giststate,
BlockNumber blkno, int level);
extern void gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb,
GISTNodeBuffer *nodeBuffer, IndexTuple item);
extern bool gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb,
GISTNodeBuffer *nodeBuffer, IndexTuple *item);
extern void gistFreeBuildBuffers(GISTBuildBuffers *gfbb);
extern void gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb,
GISTSTATE *giststate, Relation r,
int level, Buffer buffer,
List *splitinfo);
extern void gistUnloadNodeBuffers(GISTBuildBuffers *gfbb);
#endif /* GIST_PRIVATE_H */