From 526b08a6758af9652414fcd37c4f9ff79ba0da80 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Tue, 16 Apr 2024 13:21:36 -0400 Subject: [PATCH 1/5] Add nbtree skip scan optimizations. Teach nbtree composite index scans to opportunistically skip over irrelevant sections of composite indexes given a query with an omitted prefix column. When nbtree is passed input scan keys derived from a query predicate "WHERE b = 5", new nbtree preprocessing steps now output "WHERE a = ANY(<every possible 'a' value>) AND b = 5" scan keys. That is, preprocessing generates a "skip array" (along with an associated scan key) for the omitted column "a", which makes it safe to mark the scan key on "b" as required to continue the scan. This is far more efficient than a traditional full index scan whenever it allows the scan to skip over many irrelevant leaf pages, by iteratively repositioning itself using the keys on "a" and "b" together. A skip array has "elements" that are generated procedurally and on demand, but otherwise works just like a regular ScalarArrayOp array. Preprocessing can freely add a skip array before or after any input ScalarArrayOp arrays. Index scans with a skip array decide when and where to reposition the scan using the same approach as any other scan with array keys. This design builds on the design for array advancement and primitive scan scheduling added to Postgres 17 by commit 5bf748b8. The core B-Tree operator classes on most discrete types generate their array elements with the help of their own custom skip support routine. This infrastructure gives nbtree a way to generate the next required array element by incrementing (or decrementing) the current array value. It can reduce the number of index descents in cases where the next possible indexable value frequently turns out to be the next value stored in the index. 
Opclasses that lack a skip support routine fall back on having nbtree "increment" (or "decrement") a skip array's current element by setting the NEXT (or PRIOR) scan key flag, without directly changing the scan key's sk_argument. These sentinel values behave just like any other value from an array -- though they can never locate equal index tuples (they can only locate the next group of index tuples containing the next set of non-sentinel values that the scan's arrays need to advance to). Inequality scan keys can affect how skip arrays generate their values. Their range is constrained by the inequalities. For example, a skip array on "a" will only use element values 1 and 2 given a qual such as "WHERE a BETWEEN 1 AND 2 AND b = 66". A scan using such a skip array has almost identical performance characteristics to one with the qual "WHERE a = ANY('{1, 2}') AND b = 66". The scan will be much faster when it can be executed as two selective primitive index scans instead of a single very large index scan that reads many irrelevant leaf pages. However, the array transformation process won't always lead to improved performance at runtime. Much depends on physical index characteristics. B-Tree preprocessing is optimistic about skipping working out: it applies static, generic rules when determining where to generate skip arrays, which assumes that the runtime overhead of maintaining skip arrays will pay for itself -- or lead to only a modest performance loss. As things stand, these assumptions are much too optimistic: skip array maintenance will lead to unacceptable regressions with unsympathetic queries (queries whose scan can't skip over many irrelevant leaf pages). An upcoming commit will address the problems in this area by enhancing _bt_readpage's approach to saving cycles on scan key evaluation, making it work in a way that directly considers the needs of = array keys (particularly = skip array keys). 
Author: Peter Geoghegan Reviewed-By: Masahiro Ikeda Reviewed-By: Heikki Linnakangas Reviewed-By: Tomas Vondra Reviewed-By: Matthias van de Meent Reviewed-By: Aleksander Alekseev Reviewed-By: Alena Rybakina Discussion: https://postgr.es/m/CAH2-Wzmn1YsLzOGgjAQZdn1STSG_y8qP__vggTaPAYXJP+G4bw@mail.gmail.com --- doc/src/sgml/btree.sgml | 34 +- doc/src/sgml/indexam.sgml | 3 +- doc/src/sgml/indices.sgml | 49 +- doc/src/sgml/monitoring.sgml | 4 +- doc/src/sgml/perform.sgml | 31 + doc/src/sgml/xindex.sgml | 16 +- src/backend/access/index/indexam.c | 3 +- src/backend/access/nbtree/nbtcompare.c | 273 +++++++ src/backend/access/nbtree/nbtpreprocesskeys.c | 628 +++++++++++++-- src/backend/access/nbtree/nbtree.c | 196 ++++- src/backend/access/nbtree/nbtsearch.c | 130 ++- src/backend/access/nbtree/nbtutils.c | 756 ++++++++++++++++-- src/backend/access/nbtree/nbtvalidate.c | 4 + src/backend/commands/opclasscmds.c | 25 + src/backend/utils/adt/Makefile | 1 + src/backend/utils/adt/date.c | 46 ++ src/backend/utils/adt/meson.build | 1 + src/backend/utils/adt/selfuncs.c | 491 +++++++++--- src/backend/utils/adt/skipsupport.c | 61 ++ src/backend/utils/adt/timestamp.c | 48 ++ src/backend/utils/adt/uuid.c | 70 ++ src/include/access/amapi.h | 3 +- src/include/access/nbtree.h | 34 +- src/include/catalog/pg_amproc.dat | 22 + src/include/catalog/pg_proc.dat | 27 + src/include/utils/skipsupport.h | 98 +++ src/test/regress/expected/alter_generic.out | 10 +- src/test/regress/expected/btree_index.out | 41 + src/test/regress/expected/create_index.out | 183 ++++- src/test/regress/expected/psql.out | 3 +- src/test/regress/sql/alter_generic.sql | 5 +- src/test/regress/sql/btree_index.sql | 21 + src/test/regress/sql/create_index.sql | 63 +- src/tools/pgindent/typedefs.list | 3 + 34 files changed, 2998 insertions(+), 385 deletions(-) create mode 100644 src/backend/utils/adt/skipsupport.c create mode 100644 src/include/utils/skipsupport.h diff --git a/doc/src/sgml/btree.sgml b/doc/src/sgml/btree.sgml 
index 2b3997988cff..3e6f30d74627 100644 --- a/doc/src/sgml/btree.sgml +++ b/doc/src/sgml/btree.sgml @@ -207,7 +207,7 @@ As shown in , btree defines - one required and four optional support functions. The five + one required and five optional support functions. The six user-defined methods are: @@ -583,6 +583,38 @@ options(relopts local_relopts *) returns + + skipsupport + + + Optionally, a btree operator family may provide a skip + support function, registered under support function + number 6. These functions allow the B-tree code to more efficiently + navigate the index structure during an index skip scan. Operator classes + that implement skip support provide the core B-Tree code with a way of + enumerating and iterating through every possible value from the domain of + indexable values. The APIs involved in this are defined in + src/include/utils/skipsupport.h. + + + Operator classes that do not provide a skip support function are still + eligible to use skip scan. The core code can still use a fallback + strategy, though it might be somewhat less efficient with discrete types. + It usually doesn't make sense (and may not even be feasible) for operator + classes on continuous types to provide a skip support function. + + + It is not sensible for an operator family to register a cross-type + skipsupport function, and attempting to do so will + result in an error. This is because determining the next indexable value + from some earlier value does not just depend on sorting/equality + semantics, which are more or less defined at the operator family level. + Skip scan works by exhaustively considering every possible value that + might be stored in an index, so the domain of the particular data type + stored within the index (the input opclass type) must also be considered. 
+ + + diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml index 768b77aa0d2b..d5adb58c163c 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -835,7 +835,8 @@ amrestrpos (IndexScanDesc scan); Size -amestimateparallelscan (int nkeys, +amestimateparallelscan (Relation indexRelation, + int nkeys, int norderbys); Estimate and return the number of bytes of dynamic shared memory which diff --git a/doc/src/sgml/indices.sgml b/doc/src/sgml/indices.sgml index 6d731e0701fd..b0cb09eb767c 100644 --- a/doc/src/sgml/indices.sgml +++ b/doc/src/sgml/indices.sgml @@ -457,23 +457,26 @@ CREATE INDEX test2_mm_idx ON test2 (major, minor); A multicolumn B-tree index can be used with query conditions that involve any subset of the index's columns, but the index is most - efficient when there are constraints on the leading (leftmost) columns. - The exact rule is that equality constraints on leading columns, plus - any inequality constraints on the first column that does not have an - equality constraint, will be used to limit the portion of the index - that is scanned. Constraints on columns to the right of these columns - are checked in the index, so they save visits to the table proper, but - they do not reduce the portion of the index that has to be scanned. + efficient when there are equality constraints on the leading (leftmost) columns. + B-Tree index scans can use the index skip scan strategy to generate + equality constraints on prefix columns that were wholly omitted from the + query predicate, as well as prefix columns whose values were constrained by + inequality conditions. For example, given an index on (a, b, c) and a query condition WHERE a = 5 AND b >= 42 AND c < 77, the index would have to be scanned from the first entry with a = 5 and b = 42 up through the last entry with - a = 5. Index entries with c >= 77 would be - skipped, but they'd still have to be scanned through. + a = 5. 
Intervening groups of index entries with + c >= 77 would not need to be returned by the scan, + and can be skipped over entirely by applying the skip scan strategy. This index could in principle be used for queries that have constraints on b and/or c with no constraint on a - — but the entire index would have to be scanned, so in most cases - the planner would prefer a sequential table scan over using the index. + — but that approach is generally only taken when there are so few + distinct a values that the planner expects the skip scan + strategy to allow the scan to skip over most individual index leaf pages. + If there are many distinct a values, then the entire + index will have to be scanned, so in most cases the planner will prefer a + sequential table scan over using the index. @@ -508,11 +511,15 @@ CREATE INDEX test2_mm_idx ON test2 (major, minor); - Multicolumn indexes should be used sparingly. In most situations, - an index on a single column is sufficient and saves space and time. - Indexes with more than three columns are unlikely to be helpful - unless the usage of the table is extremely stylized. See also - and + Multicolumn indexes should only be used when testing shows that they'll + offer a clear advantage over simply using multiple single column indexes. + Indexes with more than three columns can make sense, but only when most + queries that make use of later columns also make use of earlier prefix + columns. It's possible for B-Tree index scans to make use of skip + scan optimizations with queries that omit a low cardinality + leading prefix column, but this is usually much less efficient than a scan + of an index without the extra prefix column. See and for some discussion of the merits of different index configurations. @@ -669,9 +676,13 @@ CREATE INDEX test3_desc_index ON test3 (id DESC NULLS LAST); multicolumn index on (x, y). 
This index would typically be more efficient than index combination for queries involving both columns, but as discussed in , it - would be almost useless for queries involving only y, so it - should not be the only index. A combination of the multicolumn index - and a separate index on y would serve reasonably well. For + would be less useful for queries involving only y. Just + how useful might depend on how effective the B-Tree index skip scan + optimization is; if x has no more than several hundred + distinct values, skip scan will make searches for specific + y values execute reasonably efficiently. A combination + of a multicolumn index on (x, y) and a separate index on + y might also serve reasonably well. For queries involving only x, the multicolumn index could be used, though it would be larger and hence slower than an index on x alone. The last alternative is to create all three diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index a6d67d2fbaa1..34deed8db656 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -4263,7 +4263,9 @@ description | Waiting for a newly initialized WAL file to reach durable storage column_name = value2 ... construct, though only when the optimizer transforms the construct into an equivalent - multi-valued array representation. + multi-valued array representation. Similarly, when B-Tree index scans use + the skip scan strategy, an index search is performed each time the scan is + repositioned to the next index leaf page that might have matching tuples. diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 387baac7e8c2..d0470ac796d7 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -860,6 +860,37 @@ EXPLAIN ANALYZE SELECT * FROM tenk1 WHERE thousand IN (1, 2, 3, 4); tenk1_thous_tenthous index leaf page. 
+ + The Index Searches line is also useful with B-tree index + scans that apply the skip scan optimization to + more efficiently traverse through an index: + +EXPLAIN ANALYZE SELECT four, unique1 FROM tenk1 WHERE four BETWEEN 1 AND 3 AND unique1 = 42; + QUERY PLAN +-------------------------------------------------------------------&zwsp;--------------------------------------------------------------- + Index Only Scan using tenk1_four_unique1_idx on tenk1 (cost=0.29..6.90 rows=1 width=8) (actual time=0.006..0.007 rows=1.00 loops=1) + Index Cond: ((four >= 1) AND (four <= 3) AND (unique1 = 42)) + Heap Fetches: 0 + Index Searches: 3 + Buffers: shared hit=7 + Planning Time: 0.029 ms + Execution Time: 0.012 ms + + + Here we see an Index-Only Scan node using + tenk1_four_unique1_idx, a composite index on the + tenk1 table's four and + unique1 columns. The scan performs 3 searches + that each read a single index leaf page: + four = 1 AND unique1 = 42, + four = 2 AND unique1 = 42, and + four = 3 AND unique1 = 42. This index + is generally a good target for skip scan, since its leading column (the + four column) contains only 4 distinct values, + while its second/final column (the unique1 + column) contains many distinct values. 
+ + Another type of extra information is the number of rows removed by a filter condition: diff --git a/doc/src/sgml/xindex.sgml b/doc/src/sgml/xindex.sgml index 053619624950..7e23a7b6e432 100644 --- a/doc/src/sgml/xindex.sgml +++ b/doc/src/sgml/xindex.sgml @@ -461,6 +461,13 @@ 5 + + + Return the addresses of C-callable skip support function(s) + (optional) + + 6 + @@ -1062,7 +1069,8 @@ DEFAULT FOR TYPE int8 USING btree FAMILY integer_ops AS FUNCTION 1 btint8cmp(int8, int8) , FUNCTION 2 btint8sortsupport(internal) , FUNCTION 3 in_range(int8, int8, int8, boolean, boolean) , - FUNCTION 4 btequalimage(oid) ; + FUNCTION 4 btequalimage(oid) , + FUNCTION 6 btint8skipsupport(internal) ; CREATE OPERATOR CLASS int4_ops DEFAULT FOR TYPE int4 USING btree FAMILY integer_ops AS @@ -1075,7 +1083,8 @@ DEFAULT FOR TYPE int4 USING btree FAMILY integer_ops AS FUNCTION 1 btint4cmp(int4, int4) , FUNCTION 2 btint4sortsupport(internal) , FUNCTION 3 in_range(int4, int4, int4, boolean, boolean) , - FUNCTION 4 btequalimage(oid) ; + FUNCTION 4 btequalimage(oid) , + FUNCTION 6 btint4skipsupport(internal) ; CREATE OPERATOR CLASS int2_ops DEFAULT FOR TYPE int2 USING btree FAMILY integer_ops AS @@ -1088,7 +1097,8 @@ DEFAULT FOR TYPE int2 USING btree FAMILY integer_ops AS FUNCTION 1 btint2cmp(int2, int2) , FUNCTION 2 btint2sortsupport(internal) , FUNCTION 3 in_range(int2, int2, int2, boolean, boolean) , - FUNCTION 4 btequalimage(oid) ; + FUNCTION 4 btequalimage(oid) , + FUNCTION 6 btint2skipsupport(internal) ; ALTER OPERATOR FAMILY integer_ops USING btree ADD -- cross-type comparisons int8 vs int2 diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 55ec4c103527..219df1971da6 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -489,7 +489,8 @@ index_parallelscan_estimate(Relation indexRelation, int nkeys, int norderbys, if (parallel_aware && indexRelation->rd_indam->amestimateparallelscan != NULL) nbytes = 
add_size(nbytes, - indexRelation->rd_indam->amestimateparallelscan(nkeys, + indexRelation->rd_indam->amestimateparallelscan(indexRelation, + nkeys, norderbys)); return nbytes; diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c index 291cb8fc15d9..4da5a3c1d161 100644 --- a/src/backend/access/nbtree/nbtcompare.c +++ b/src/backend/access/nbtree/nbtcompare.c @@ -58,6 +58,7 @@ #include #include "utils/fmgrprotos.h" +#include "utils/skipsupport.h" #include "utils/sortsupport.h" #ifdef STRESS_SORT_INT_MIN @@ -78,6 +79,51 @@ btboolcmp(PG_FUNCTION_ARGS) PG_RETURN_INT32((int32) a - (int32) b); } +static Datum +bool_decrement(Relation rel, Datum existing, bool *underflow) +{ + bool bexisting = DatumGetBool(existing); + + if (bexisting == false) + { + /* return value is undefined */ + *underflow = true; + return (Datum) 0; + } + + *underflow = false; + return BoolGetDatum(bexisting - 1); +} + +static Datum +bool_increment(Relation rel, Datum existing, bool *overflow) +{ + bool bexisting = DatumGetBool(existing); + + if (bexisting == true) + { + /* return value is undefined */ + *overflow = true; + return (Datum) 0; + } + + *overflow = false; + return BoolGetDatum(bexisting + 1); +} + +Datum +btboolskipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = bool_decrement; + sksup->increment = bool_increment; + sksup->low_elem = BoolGetDatum(false); + sksup->high_elem = BoolGetDatum(true); + + PG_RETURN_VOID(); +} + Datum btint2cmp(PG_FUNCTION_ARGS) { @@ -105,6 +151,51 @@ btint2sortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +int2_decrement(Relation rel, Datum existing, bool *underflow) +{ + int16 iexisting = DatumGetInt16(existing); + + if (iexisting == PG_INT16_MIN) + { + /* return value is undefined */ + *underflow = true; + return (Datum) 0; + } + + *underflow = false; + return Int16GetDatum(iexisting - 1); +} + +static Datum +int2_increment(Relation rel, Datum 
existing, bool *overflow) +{ + int16 iexisting = DatumGetInt16(existing); + + if (iexisting == PG_INT16_MAX) + { + /* return value is undefined */ + *overflow = true; + return (Datum) 0; + } + + *overflow = false; + return Int16GetDatum(iexisting + 1); +} + +Datum +btint2skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = int2_decrement; + sksup->increment = int2_increment; + sksup->low_elem = Int16GetDatum(PG_INT16_MIN); + sksup->high_elem = Int16GetDatum(PG_INT16_MAX); + + PG_RETURN_VOID(); +} + Datum btint4cmp(PG_FUNCTION_ARGS) { @@ -128,6 +219,51 @@ btint4sortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +int4_decrement(Relation rel, Datum existing, bool *underflow) +{ + int32 iexisting = DatumGetInt32(existing); + + if (iexisting == PG_INT32_MIN) + { + /* return value is undefined */ + *underflow = true; + return (Datum) 0; + } + + *underflow = false; + return Int32GetDatum(iexisting - 1); +} + +static Datum +int4_increment(Relation rel, Datum existing, bool *overflow) +{ + int32 iexisting = DatumGetInt32(existing); + + if (iexisting == PG_INT32_MAX) + { + /* return value is undefined */ + *overflow = true; + return (Datum) 0; + } + + *overflow = false; + return Int32GetDatum(iexisting + 1); +} + +Datum +btint4skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = int4_decrement; + sksup->increment = int4_increment; + sksup->low_elem = Int32GetDatum(PG_INT32_MIN); + sksup->high_elem = Int32GetDatum(PG_INT32_MAX); + + PG_RETURN_VOID(); +} + Datum btint8cmp(PG_FUNCTION_ARGS) { @@ -171,6 +307,51 @@ btint8sortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +int8_decrement(Relation rel, Datum existing, bool *underflow) +{ + int64 iexisting = DatumGetInt64(existing); + + if (iexisting == PG_INT64_MIN) + { + /* return value is undefined */ + *underflow = true; + return (Datum) 0; + } + + *underflow = false; + return 
Int64GetDatum(iexisting - 1); +} + +static Datum +int8_increment(Relation rel, Datum existing, bool *overflow) +{ + int64 iexisting = DatumGetInt64(existing); + + if (iexisting == PG_INT64_MAX) + { + /* return value is undefined */ + *overflow = true; + return (Datum) 0; + } + + *overflow = false; + return Int64GetDatum(iexisting + 1); +} + +Datum +btint8skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = int8_decrement; + sksup->increment = int8_increment; + sksup->low_elem = Int64GetDatum(PG_INT64_MIN); + sksup->high_elem = Int64GetDatum(PG_INT64_MAX); + + PG_RETURN_VOID(); +} + Datum btint48cmp(PG_FUNCTION_ARGS) { @@ -292,6 +473,51 @@ btoidsortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +oid_decrement(Relation rel, Datum existing, bool *underflow) +{ + Oid oexisting = DatumGetObjectId(existing); + + if (oexisting == InvalidOid) + { + /* return value is undefined */ + *underflow = true; + return (Datum) 0; + } + + *underflow = false; + return ObjectIdGetDatum(oexisting - 1); +} + +static Datum +oid_increment(Relation rel, Datum existing, bool *overflow) +{ + Oid oexisting = DatumGetObjectId(existing); + + if (oexisting == OID_MAX) + { + /* return value is undefined */ + *overflow = true; + return (Datum) 0; + } + + *overflow = false; + return ObjectIdGetDatum(oexisting + 1); +} + +Datum +btoidskipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = oid_decrement; + sksup->increment = oid_increment; + sksup->low_elem = ObjectIdGetDatum(InvalidOid); + sksup->high_elem = ObjectIdGetDatum(OID_MAX); + + PG_RETURN_VOID(); +} + Datum btoidvectorcmp(PG_FUNCTION_ARGS) { @@ -325,3 +551,50 @@ btcharcmp(PG_FUNCTION_ARGS) /* Be careful to compare chars as unsigned */ PG_RETURN_INT32((int32) ((uint8) a) - (int32) ((uint8) b)); } + +static Datum +char_decrement(Relation rel, Datum existing, bool *underflow) +{ + uint8 cexisting = 
DatumGetUInt8(existing); + + if (cexisting == 0) + { + /* return value is undefined */ + *underflow = true; + return (Datum) 0; + } + + *underflow = false; + return CharGetDatum((uint8) cexisting - 1); +} + +static Datum +char_increment(Relation rel, Datum existing, bool *overflow) +{ + uint8 cexisting = DatumGetUInt8(existing); + + if (cexisting == UCHAR_MAX) + { + /* return value is undefined */ + *overflow = true; + return (Datum) 0; + } + + *overflow = false; + return CharGetDatum((uint8) cexisting + 1); +} + +Datum +btcharskipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = char_decrement; + sksup->increment = char_increment; + + /* btcharcmp compares chars as unsigned */ + sksup->low_elem = UInt8GetDatum(0); + sksup->high_elem = UInt8GetDatum(UCHAR_MAX); + + PG_RETURN_VOID(); +} diff --git a/src/backend/access/nbtree/nbtpreprocesskeys.c b/src/backend/access/nbtree/nbtpreprocesskeys.c index 38a87af1cc8b..5c08cda25a7d 100644 --- a/src/backend/access/nbtree/nbtpreprocesskeys.c +++ b/src/backend/access/nbtree/nbtpreprocesskeys.c @@ -45,8 +45,15 @@ static bool _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey, FmgrInfo *orderproc, BTArrayKeyInfo *array, bool *qual_ok); +static bool _bt_saoparray_shrink(IndexScanDesc scan, ScanKey arraysk, + ScanKey skey, FmgrInfo *orderproc, + BTArrayKeyInfo *array, bool *qual_ok); +static bool _bt_skiparray_shrink(IndexScanDesc scan, ScanKey skey, + BTArrayKeyInfo *array, bool *qual_ok); static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys); static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap); +static int _bt_num_array_keys(IndexScanDesc scan, Oid *skip_eq_ops, + int *numSkipArrayKeys); static Datum _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, Oid elemtype, StrategyNumber strat, Datum *elems, int nelems); @@ -89,6 +96,8 @@ static int _bt_compare_array_elements(const 
void *a, const void *b, void *arg); * within each attribute may be done as a byproduct of the processing here. * That process must leave array scan keys (within an attribute) in the same * order as corresponding entries from the scan's BTArrayKeyInfo array info. + * We might also construct skip array scan keys that weren't present in the + * original input keys; these are also output in standard attribute order. * * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD * if they must be satisfied in order to continue the scan forward or backward @@ -101,10 +110,16 @@ static int _bt_compare_array_elements(const void *a, const void *b, void *arg); * attributes with "=" keys are marked both SK_BT_REQFWD and SK_BT_REQBKWD. * For the first attribute without an "=" key, any "<" and "<=" keys are * marked SK_BT_REQFWD while any ">" and ">=" keys are marked SK_BT_REQBKWD. - * This can be seen to be correct by considering the above example. Note - * in particular that if there are no keys for a given attribute, the keys for - * subsequent attributes can never be required; for instance "WHERE y = 4" - * requires a full-index scan. + * This can be seen to be correct by considering the above example. + * + * If we never generated skip array scan keys, it would be possible for "gaps" + * to appear that make it unsafe to mark any subsequent input scan keys + * (copied from scan->keyData[]) as required to continue the scan. Prior to + * Postgres 18, a qual like "WHERE y = 4" always resulted in a full scan. + * This qual now becomes "WHERE x = ANY('{every possible x value}') and y = 4" + * on output. In other words, preprocessing now adds a skip array on "x". + * This has the potential to be much more efficient than a full index scan + * (though it behaves like a full scan when there's many distinct "x" values). * * If possible, redundant keys are eliminated: we keep only the tightest * >/>= bound and the tightest keyData[]). 
+ * + * Row comparison keys currently have a couple of notable limitations. + * Right now we just transfer them into the preprocessed array without any * editorialization. We can treat them the same as an ordinary inequality * comparison on the row's first index column, for the purposes of the logic - * about required keys. + * about required keys. Also, we are unable to merge a row comparison key + * into a skip array (only ordinary inequalities are merged). A key that + * comes after a Row comparison key is therefore never marked as required + * (we won't add a useless skip array that can't be merged with a RowCompare). * * Note: the reason we have to copy the preprocessed scan keys into private * storage is that we are modifying the array based on comparisons of the @@ -200,6 +225,14 @@ _bt_preprocess_keys(IndexScanDesc scan) /* Also maintain keyDataMap for remapping so->orderProcs[] later */ keyDataMap = MemoryContextAlloc(so->arrayContext, numberOfKeys * sizeof(int)); + + /* + * Also enlarge output array when it might otherwise not have room for + * a skip array's scan key + */ + if (numberOfKeys > scan->numberOfKeys) + so->keyData = repalloc(so->keyData, + numberOfKeys * sizeof(ScanKeyData)); } else inkeys = scan->keyData; @@ -229,6 +262,7 @@ _bt_preprocess_keys(IndexScanDesc scan) Assert(so->keyData[0].sk_flags & SK_SEARCHARRAY); Assert(so->keyData[0].sk_strategy != BTEqualStrategyNumber || (so->arrayKeys[0].scan_key == 0 && + !(so->keyData[0].sk_flags & SK_BT_SKIP) && OidIsValid(so->orderProcs[0].fn_oid))); } @@ -288,7 +322,8 @@ _bt_preprocess_keys(IndexScanDesc scan) * redundant. Note that this is no less true if the = key is * SEARCHARRAY; the only real difference is that the inequality * key _becomes_ redundant by making _bt_compare_scankey_args - * eliminate the subset of elements that won't need to be matched. + * eliminate the subset of elements that won't need to be matched + * (with SAOP arrays and skip arrays alike). 
* * If we have a case like "key = 1 AND key > 2", we set qual_ok to * false and abandon further processing. We'll do the same thing @@ -345,7 +380,6 @@ _bt_preprocess_keys(IndexScanDesc scan) return; } /* else discard the redundant non-equality key */ - Assert(!array || array->num_elems > 0); xform[j].inkey = NULL; xform[j].inkeyi = -1; } @@ -393,6 +427,11 @@ _bt_preprocess_keys(IndexScanDesc scan) * Emit the cleaned-up keys into the so->keyData[] array, and then * mark them if they are required. They are required (possibly * only in one direction) if all attrs before this one had "=". + * + * In practice we'll rarely output non-required scan keys here; + * typically, _bt_preprocess_array_keys has already added "=" keys + * sufficient to form an unbroken series of "=" constraints on all + * attrs prior to the attr from the final scan->keyData[] key. */ for (j = BTMaxStrategyNumber; --j >= 0;) { @@ -481,6 +520,7 @@ _bt_preprocess_keys(IndexScanDesc scan) Assert(array->scan_key == i); Assert(OidIsValid(orderproc->fn_oid)); + Assert(!(inkey->sk_flags & SK_BT_SKIP)); } else if (xform[j].inkey->sk_flags & SK_SEARCHARRAY) { @@ -489,6 +529,7 @@ _bt_preprocess_keys(IndexScanDesc scan) Assert(array->scan_key == xform[j].inkeyi); Assert(OidIsValid(orderproc->fn_oid)); + Assert(!(xform[j].inkey->sk_flags & SK_BT_SKIP)); } /* @@ -508,8 +549,6 @@ _bt_preprocess_keys(IndexScanDesc scan) /* Have all we need to determine redundancy */ if (test_result) { - Assert(!array || array->num_elems > 0); - /* * New key is more restrictive, and so replaces old key... */ @@ -803,6 +842,9 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, cmp_op; StrategyNumber strat; + Assert(!((leftarg->sk_flags | rightarg->sk_flags) & + (SK_ROW_HEADER | SK_ROW_MEMBER))); + /* * First, deal with cases where one or both args are NULL. This should * only happen when the scankeys represent IS NULL/NOT NULL conditions. 
@@ -812,6 +854,22 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, bool leftnull, rightnull; + /* Handle skip array comparison with IS NOT NULL scan key */ + if ((leftarg->sk_flags | rightarg->sk_flags) & SK_BT_SKIP) + { + /* Shouldn't generate skip array in presence of IS NULL key */ + Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_SEARCHNULL)); + Assert((leftarg->sk_flags | rightarg->sk_flags) & SK_SEARCHNOTNULL); + + /* Skip array will have no NULL element/IS NULL scan key */ + Assert(array->num_elems == -1); + array->null_elem = false; + + /* IS NOT NULL key (could be leftarg or rightarg) now redundant */ + *result = true; + return true; + } + if (leftarg->sk_flags & SK_ISNULL) { Assert(leftarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL)); @@ -885,6 +943,7 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, { /* Can't make the comparison */ *result = false; /* suppress compiler warnings */ + Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_BT_SKIP)); return false; } @@ -978,24 +1037,55 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, * Compare an array scan key to a scalar scan key, eliminating contradictory * array elements such that the scalar scan key becomes redundant. * + * If the opfamily is incomplete we may not be able to determine which + * elements are contradictory. When we return true we'll have validly set + * *qual_ok, guaranteeing that at least the scalar scan key can be considered + * redundant. We return false if the comparison could not be made (caller + * must keep both scan keys when this happens). + * + * Note: it's up to caller to deal with IS [NOT] NULL scan keys, as well as + * row comparison scan keys. We only deal with scalar scan keys. 
+ */ +static bool +_bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey, + FmgrInfo *orderproc, BTArrayKeyInfo *array, + bool *qual_ok) +{ + Assert(arraysk->sk_attno == skey->sk_attno); + Assert(!(arraysk->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER))); + Assert((arraysk->sk_flags & SK_SEARCHARRAY) && + arraysk->sk_strategy == BTEqualStrategyNumber); + /* don't expect to have to deal with NULLs/row comparison scan keys */ + Assert(!(skey->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER))); + Assert(!(skey->sk_flags & SK_SEARCHARRAY) || + skey->sk_strategy != BTEqualStrategyNumber); + + /* + * Just call the appropriate helper function based on whether it's a SAOP + * array or a skip array. Both helpers will set *qual_ok in passing. + */ + if (array->num_elems != -1) + return _bt_saoparray_shrink(scan, arraysk, skey, orderproc, array, + qual_ok); + else + return _bt_skiparray_shrink(scan, skey, array, qual_ok); +} + +/* + * Preprocessing of SAOP (non-skip) array scan key, used to determine which + * array elements are eliminated as contradictory by a non-array scalar key. + * _bt_compare_array_scankey_args helper function. + * * Array elements can be eliminated as contradictory when excluded by some * other operator on the same attribute. For example, with an index scan qual * "WHERE a IN (1, 2, 3) AND a < 2", all array elements except the value "1" * are eliminated, and the < scan key is eliminated as redundant. Cases where * every array element is eliminated by a redundant scalar scan key have an * unsatisfiable qual, which we handle by setting *qual_ok=false for caller. - * - * If the opfamily doesn't supply a complete set of cross-type ORDER procs we - * may not be able to determine which elements are contradictory. If we have - * the required ORDER proc then we return true (and validly set *qual_ok), - * guaranteeing that at least the scalar scan key can be considered redundant. 
- * We return false if the comparison could not be made (caller must keep both - * scan keys when this happens). */ static bool -_bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey, - FmgrInfo *orderproc, BTArrayKeyInfo *array, - bool *qual_ok) +_bt_saoparray_shrink(IndexScanDesc scan, ScanKey arraysk, ScanKey skey, + FmgrInfo *orderproc, BTArrayKeyInfo *array, bool *qual_ok) { Relation rel = scan->indexRelation; Oid opcintype = rel->rd_opcintype[arraysk->sk_attno - 1]; @@ -1006,14 +1096,8 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey FmgrInfo crosstypeproc; FmgrInfo *orderprocp = orderproc; - Assert(arraysk->sk_attno == skey->sk_attno); Assert(array->num_elems > 0); - Assert(!(arraysk->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER))); - Assert((arraysk->sk_flags & SK_SEARCHARRAY) && - arraysk->sk_strategy == BTEqualStrategyNumber); - Assert(!(skey->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER))); - Assert(!(skey->sk_flags & SK_SEARCHARRAY) || - skey->sk_strategy != BTEqualStrategyNumber); + Assert(!(arraysk->sk_flags & SK_BT_SKIP)); /* * _bt_binsrch_array_skey searches an array for the entry best matching a @@ -1112,6 +1196,105 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey return true; } +/* + * Preprocessing of skip (non-SAOP) array scan key, used to determine + * redundancy against a non-array scalar scan key (must be an inequality). + * _bt_compare_array_scankey_args helper function. + * + * Unlike _bt_saoparray_shrink, we don't modify caller's array in-place. Skip + * arrays work by procedurally generating their elements as needed, so we just + * store the inequality as the skip array's low_compare or high_compare. The + * array's elements will be generated from the range of values that satisfies + * both low_compare and high_compare. 
+ */ +static bool +_bt_skiparray_shrink(IndexScanDesc scan, ScanKey skey, BTArrayKeyInfo *array, + bool *qual_ok) +{ + bool test_result; + + Assert(array->num_elems == -1); + + /* + * Array's index attribute will be constrained by a strict operator/key. + * Array must not "contain a NULL element" (i.e. the scan must not apply + * "IS NULL" qual when it reaches the end of the index that stores NULLs). + */ + array->null_elem = false; + *qual_ok = true; + + /* + * Consider if we should treat caller's scalar scan key as the skip + * array's high_compare or low_compare. + * + * In general the current array element must either be a copy of a value + * taken from an index tuple, or a derivative value generated by opclass's + * skip support function. That way the scan can always safely assume that + * it's okay to use the input-opclass-only-type proc from so->orderProcs[] + * (they can be cross-type with SAOP arrays, but never with skip arrays). + * + * This approach is enabled by MINVAL/MAXVAL sentinel key markings, which + * can be thought of as representing either the lowest or highest matching + * array element (excluding the NULL element, where applicable, though as + * just discussed it isn't applicable to this range skip array anyway). + * Array keys marked MINVAL/MAXVAL never have a valid datum in their + * sk_argument field. The scan directly applies the array's low_compare + * key when it encounters MINVAL in the array key proper (just as it + * applies high_compare when it sees MAXVAL set in the array key proper). + * The scan must never use the array's so->orderProcs[] proc against + * low_compare's/high_compare's sk_argument, either (so->orderProcs[] is + * only intended to be used with rhs datums from the array proper/index). + */ + switch (skey->sk_strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + if (array->high_compare) + { + /* replace existing high_compare with caller's key? 
*/ + if (!_bt_compare_scankey_args(scan, array->high_compare, skey, + array->high_compare, NULL, NULL, + &test_result)) + return false; /* can't determine more restrictive key */ + + if (!test_result) + return true; /* no, just discard caller's key */ + + /* yes, replace existing high_compare with caller's key */ + } + + /* caller's key becomes skip array's high_compare */ + array->high_compare = skey; + break; + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + if (array->low_compare) + { + /* replace existing low_compare with caller's key? */ + if (!_bt_compare_scankey_args(scan, array->low_compare, skey, + array->low_compare, NULL, NULL, + &test_result)) + return false; /* can't determine more restrictive key */ + + if (!test_result) + return true; /* no, just discard caller's key */ + + /* yes, replace existing low_compare with caller's key */ + } + + /* caller's key becomes skip array's low_compare */ + array->low_compare = skey; + break; + case BTEqualStrategyNumber: + default: + elog(ERROR, "unrecognized StrategyNumber: %d", + (int) skey->sk_strategy); + break; + } + + return true; +} + /* * _bt_preprocess_array_keys() -- Preprocess SK_SEARCHARRAY scan keys * @@ -1137,6 +1320,12 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey * one equality strategy array scan key per index attribute. We'll always be * able to set things up that way when complete opfamilies are used. * + * We're also responsible for generating skip arrays (and their associated + * scan keys) here. This enables skip scan. We do this for index attributes + * that initially lacked an equality condition within scan->keyData[], iff + * doing so allows a later scan key (that was passed to us in scan->keyData[]) + * to be marked required by our _bt_preprocess_keys caller. + * * We set the scan key references from the scan's BTArrayKeyInfo info array to * offsets into the temp modified input array returned to caller. 
Scans that * have array keys should call _bt_preprocess_array_keys_final when standard @@ -1144,49 +1333,45 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey * references into references to the scan's so->keyData[] output scan keys. * * Note: the reason we need to return a temp scan key array, rather than just - * scribbling on scan->keyData, is that callers are permitted to call btrescan - * without supplying a new set of scankey data. + * modifying scan->keyData[], is that callers are permitted to call btrescan + * without supplying a new set of scankey data. Certain other preprocessing + * routines (e.g., _bt_fix_scankey_strategy) _can_ modify scan->keyData[], but + * we can't make that work here because our modifications are non-idempotent. */ static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys) { BTScanOpaque so = (BTScanOpaque) scan->opaque; Relation rel = scan->indexRelation; - int numberOfKeys = scan->numberOfKeys; int16 *indoption = rel->rd_indoption; + Oid skip_eq_ops[INDEX_MAX_KEYS]; int numArrayKeys, - output_ikey = 0; + numSkipArrayKeys, + numArrayKeyData; + AttrNumber attno_skip = 1; int origarrayatt = InvalidAttrNumber, origarraykey = -1; Oid origelemtype = InvalidOid; - ScanKey cur; MemoryContext oldContext; ScanKey arrayKeyData; /* modified copy of scan->keyData */ - Assert(numberOfKeys); - - /* Quick check to see if there are any array keys */ - numArrayKeys = 0; - for (int i = 0; i < numberOfKeys; i++) - { - cur = &scan->keyData[i]; - if (cur->sk_flags & SK_SEARCHARRAY) - { - numArrayKeys++; - Assert(!(cur->sk_flags & (SK_ROW_HEADER | SK_SEARCHNULL | SK_SEARCHNOTNULL))); - /* If any arrays are null as a whole, we can quit right now. 
*/ - if (cur->sk_flags & SK_ISNULL) - { - so->qual_ok = false; - return NULL; - } - } - } + /* + * Check the number of input array keys within scan->keyData[] input keys + * (also checks if we should add extra skip arrays based on input keys) + */ + numArrayKeys = _bt_num_array_keys(scan, skip_eq_ops, &numSkipArrayKeys); /* Quit if nothing to do. */ if (numArrayKeys == 0) return NULL; + /* + * Estimated final size of arrayKeyData[] array we'll return to our caller + * is the size of the original scan->keyData[] input array, plus space for + * any additional skip array scan keys we'll need to generate below + */ + numArrayKeyData = scan->numberOfKeys + numSkipArrayKeys; + /* * Make a scan-lifespan context to hold array-associated data, or reset it * if we already have one from a previous rescan cycle. @@ -1201,18 +1386,20 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys) oldContext = MemoryContextSwitchTo(so->arrayContext); /* Create output scan keys in the workspace context */ - arrayKeyData = (ScanKey) palloc(numberOfKeys * sizeof(ScanKeyData)); + arrayKeyData = (ScanKey) palloc(numArrayKeyData * sizeof(ScanKeyData)); /* Allocate space for per-array data in the workspace context */ so->arrayKeys = (BTArrayKeyInfo *) palloc(numArrayKeys * sizeof(BTArrayKeyInfo)); /* Allocate space for ORDER procs used to help _bt_checkkeys */ - so->orderProcs = (FmgrInfo *) palloc(numberOfKeys * sizeof(FmgrInfo)); + so->orderProcs = (FmgrInfo *) palloc(numArrayKeyData * sizeof(FmgrInfo)); - /* Now process each array key */ numArrayKeys = 0; - for (int input_ikey = 0; input_ikey < numberOfKeys; input_ikey++) + numArrayKeyData = 0; + for (int input_ikey = 0; input_ikey < scan->numberOfKeys; input_ikey++) { + ScanKey inkey = scan->keyData + input_ikey, + cur; FmgrInfo sortproc; FmgrInfo *sortprocp = &sortproc; Oid elemtype; @@ -1225,21 +1412,113 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys) Datum *elem_values; bool *elem_nulls; int 
num_nonnulls; - int j; + + /* set up next output scan key */ + cur = &arrayKeyData[numArrayKeyData]; + + /* Backfill skip arrays for attrs < or <= input key's attr? */ + while (numSkipArrayKeys && attno_skip <= inkey->sk_attno) + { + Oid opfamily = rel->rd_opfamily[attno_skip - 1]; + Oid opcintype = rel->rd_opcintype[attno_skip - 1]; + Oid collation = rel->rd_indcollation[attno_skip - 1]; + Oid eq_op = skip_eq_ops[attno_skip - 1]; + CompactAttribute *attr; + RegProcedure cmp_proc; + + if (!OidIsValid(eq_op)) + { + /* + * Attribute already has an = input key, so don't output a + * skip array for attno_skip. Just copy attribute's = input + * key into arrayKeyData[] once outside this inner loop. + * + * Note: When we get here there must be a later attribute that + * lacks an equality input key, and still needs a skip array + * (if there wasn't then numSkipArrayKeys would be 0 by now). + */ + Assert(attno_skip == inkey->sk_attno); + /* inkey can't be last input key to be marked required: */ + Assert(input_ikey < scan->numberOfKeys - 1); +#if 0 + /* Could be a redundant input scan key, so can't do this: */ + Assert(inkey->sk_strategy == BTEqualStrategyNumber || + (inkey->sk_flags & SK_SEARCHNULL)); +#endif + + attno_skip++; + break; + } + + cmp_proc = get_opcode(eq_op); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing oprcode for skipping equals operator %u", eq_op); + + ScanKeyEntryInitialize(cur, + SK_SEARCHARRAY | SK_BT_SKIP, /* flags */ + attno_skip, /* skipped att number */ + BTEqualStrategyNumber, /* equality strategy */ + InvalidOid, /* opclass input subtype */ + collation, /* index column's collation */ + cmp_proc, /* equality operator's proc */ + (Datum) 0); /* constant */ + + /* Initialize generic BTArrayKeyInfo fields */ + so->arrayKeys[numArrayKeys].scan_key = numArrayKeyData; + so->arrayKeys[numArrayKeys].num_elems = -1; + + /* Initialize skip array specific BTArrayKeyInfo fields */ + attr = TupleDescCompactAttr(RelationGetDescr(rel), attno_skip 
- 1); + reverse = (indoption[attno_skip - 1] & INDOPTION_DESC) != 0; + so->arrayKeys[numArrayKeys].attlen = attr->attlen; + so->arrayKeys[numArrayKeys].attbyval = attr->attbyval; + so->arrayKeys[numArrayKeys].null_elem = true; /* for now */ + so->arrayKeys[numArrayKeys].sksup = + PrepareSkipSupportFromOpclass(opfamily, opcintype, reverse); + so->arrayKeys[numArrayKeys].low_compare = NULL; /* for now */ + so->arrayKeys[numArrayKeys].high_compare = NULL; /* for now */ + + /* + * We'll need a 3-way ORDER proc. Set that up now. + */ + _bt_setup_array_cmp(scan, cur, opcintype, + &so->orderProcs[numArrayKeyData], NULL); + + numArrayKeys++; + numArrayKeyData++; /* keep this scan key/array */ + + /* set up next output scan key */ + cur = &arrayKeyData[numArrayKeyData]; + + /* remember having output this skip array and scan key */ + numSkipArrayKeys--; + attno_skip++; + } /* * Provisionally copy scan key into arrayKeyData[] array we'll return * to _bt_preprocess_keys caller */ - cur = &arrayKeyData[output_ikey]; - *cur = scan->keyData[input_ikey]; + *cur = *inkey; if (!(cur->sk_flags & SK_SEARCHARRAY)) { - output_ikey++; /* keep this non-array scan key */ + numArrayKeyData++; /* keep this non-array scan key */ continue; } + /* + * Process SAOP array scan key + */ + Assert(!(cur->sk_flags & (SK_ROW_HEADER | SK_SEARCHNULL | SK_SEARCHNOTNULL))); + + /* If array is null as a whole, the scan qual is unsatisfiable */ + if (cur->sk_flags & SK_ISNULL) + { + so->qual_ok = false; + break; + } + /* * Deconstruct the array into elements */ @@ -1257,7 +1536,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys) * all btree operators are strict. 
*/ num_nonnulls = 0; - for (j = 0; j < num_elems; j++) + for (int j = 0; j < num_elems; j++) { if (!elem_nulls[j]) elem_values[num_nonnulls++] = elem_values[j]; @@ -1295,7 +1574,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys) _bt_find_extreme_element(scan, cur, elemtype, BTGreaterStrategyNumber, elem_values, num_nonnulls); - output_ikey++; /* keep this transformed scan key */ + numArrayKeyData++; /* keep this transformed scan key */ continue; case BTEqualStrategyNumber: /* proceed with rest of loop */ @@ -1306,7 +1585,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys) _bt_find_extreme_element(scan, cur, elemtype, BTLessStrategyNumber, elem_values, num_nonnulls); - output_ikey++; /* keep this transformed scan key */ + numArrayKeyData++; /* keep this transformed scan key */ continue; default: elog(ERROR, "unrecognized StrategyNumber: %d", @@ -1323,7 +1602,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys) * sortproc just points to the same proc used during binary searches. */ _bt_setup_array_cmp(scan, cur, elemtype, - &so->orderProcs[output_ikey], &sortprocp); + &so->orderProcs[numArrayKeyData], &sortprocp); /* * Sort the non-null elements and eliminate any duplicates. We must @@ -1392,23 +1671,24 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys) origelemtype = elemtype; } - /* - * And set up the BTArrayKeyInfo data. - * - * Note: _bt_preprocess_array_keys_final will fix-up each array's - * scan_key field later on, after so->keyData[] has been finalized. - */ - so->arrayKeys[numArrayKeys].scan_key = output_ikey; + /* Initialize generic BTArrayKeyInfo fields */ + so->arrayKeys[numArrayKeys].scan_key = numArrayKeyData; so->arrayKeys[numArrayKeys].num_elems = num_elems; + + /* Initialize SAOP array specific BTArrayKeyInfo fields */ so->arrayKeys[numArrayKeys].elem_values = elem_values; + so->arrayKeys[numArrayKeys].cur_elem = -1; /* i.e. 
invalid */ + numArrayKeys++; - output_ikey++; /* keep this scan key/array */ + numArrayKeyData++; /* keep this scan key/array */ } + Assert(numSkipArrayKeys == 0 || !so->qual_ok); + /* Set final number of equality-type array keys */ so->numArrayKeys = numArrayKeys; - /* Set number of scan keys remaining in arrayKeyData[] */ - *new_numberOfKeys = output_ikey; + /* Set number of scan keys in arrayKeyData[] */ + *new_numberOfKeys = numArrayKeyData; MemoryContextSwitchTo(oldContext); @@ -1514,7 +1794,14 @@ _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap) { BTArrayKeyInfo *array = &so->arrayKeys[arrayidx]; - Assert(array->num_elems > 0); + /* + * All skip arrays must be marked required, and final column can + * never have a skip array + */ + Assert(array->num_elems > 0 || array->num_elems == -1); + Assert(array->num_elems != -1 || outkey->sk_flags & SK_BT_REQFWD); + Assert(array->num_elems != -1 || + outkey->sk_attno < IndexRelationGetNumberOfKeyAttributes(rel)); if (array->scan_key == input_ikey) { @@ -1575,6 +1862,197 @@ _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap) so->numArrayKeys, INDEX_MAX_KEYS))); } +/* + * _bt_num_array_keys() -- determine # of BTArrayKeyInfo entries + * + * _bt_preprocess_array_keys helper function. Returns the estimated size of + * the scan's BTArrayKeyInfo array, which is guaranteed to be large enough to + * fit every so->arrayKeys[] entry. + * + * Also sets *numSkipArrayKeys to # of skip arrays _bt_preprocess_array_keys + * caller must add to the scan keys it'll output. Caller must add this many + * skip arrays to each of the most significant attributes lacking any keys + * that use the = strategy (IS NULL keys count as = keys here). The specific + * attributes that need skip arrays are indicated by initializing caller's + * skip_eq_ops[] 0-based attribute offset to a valid = op strategy Oid.
We'll + * only ever set skip_eq_ops[] entries to InvalidOid for attributes that + * already have an equality key in scan->keyData[] input keys -- and only when + * there's some later "attribute gap" for us to "fill-in" with a skip array. + * + * We're optimistic about skipping working out: we always add exactly the skip + * arrays needed to maximize the number of input scan keys that can ultimately + * be marked as required to continue the scan (but no more). For a composite + * index on (a, b, c, d), we'll instruct caller to add skip arrays as follows: + * + * Input keys Output keys (after all preprocessing) + * ---------- ------------------------------------- + * a = 1 a = 1 (no skip arrays) + * b = 42 skip a AND b = 42 + * a = 1 AND b = 42 a = 1 AND b = 42 (no skip arrays) + * a >= 1 AND b = 42 range skip a AND b = 42 + * a = 1 AND b > 42 a = 1 AND b > 42 (no skip arrays) + * a >= 1 AND a <= 3 AND b = 42 range skip a AND b = 42 + * a = 1 AND c <= 27 a = 1 AND skip b AND c <= 27 + * a = 1 AND d >= 1 a = 1 AND skip b AND skip c AND d >= 1 + * a = 1 AND b >= 42 AND d > 1 a = 1 AND range skip b AND skip c AND d > 1 + */ +static int +_bt_num_array_keys(IndexScanDesc scan, Oid *skip_eq_ops, int *numSkipArrayKeys) +{ + Relation rel = scan->indexRelation; + AttrNumber attno_skip = 1, + attno_inkey = 1; + bool attno_has_equal = false, + attno_has_rowcompare = false; + int numSAOPArrayKeys, + prev_numSkipArrayKeys; + + Assert(scan->numberOfKeys); + + /* Initial pass over input scan keys counts the number of SAOP arrays */ + numSAOPArrayKeys = 0; + prev_numSkipArrayKeys = 0; + *numSkipArrayKeys = 0; + for (int i = 0; i < scan->numberOfKeys; i++) + { + ScanKey inkey = scan->keyData + i; + + if (inkey->sk_flags & SK_SEARCHARRAY) + numSAOPArrayKeys++; + } + +#ifdef DEBUG_DISABLE_SKIP_SCAN + /* don't attempt to add skip arrays */ + return numSAOPArrayKeys; +#endif + + for (int i = 0;; i++) + { + ScanKey inkey = scan->keyData + i; + + /* + * Backfill skip arrays for any wholly
omitted attributes prior to + * attno_inkey + */ + while (attno_skip < attno_inkey) + { + Oid opfamily = rel->rd_opfamily[attno_skip - 1]; + Oid opcintype = rel->rd_opcintype[attno_skip - 1]; + + /* Look up input opclass's equality operator (might fail) */ + skip_eq_ops[attno_skip - 1] = + get_opfamily_member(opfamily, opcintype, opcintype, + BTEqualStrategyNumber); + if (!OidIsValid(skip_eq_ops[attno_skip - 1])) + { + /* + * Cannot generate a skip array for this or later attributes + * (input opclass lacks an equality strategy operator) + */ + *numSkipArrayKeys = prev_numSkipArrayKeys; + return numSAOPArrayKeys + prev_numSkipArrayKeys; + } + + /* plan on adding a backfill skip array for this attribute */ + (*numSkipArrayKeys)++; + attno_skip++; + } + + prev_numSkipArrayKeys = *numSkipArrayKeys; + + /* + * Stop once past the final input scan key. We deliberately never add + * a skip array for the last input scan key's attribute -- even when + * there are only inequality keys on that attribute. + */ + if (i == scan->numberOfKeys) + break; + + /* + * Later preprocessing steps cannot merge a RowCompare into a skip + * array, so stop adding skip arrays once we see one. (Note that we + * can backfill skip arrays before a RowCompare, which will allow keys + * up to and including the RowCompare to be marked required.) + * + * Skip arrays work by maintaining a current array element value, + * which anchors lower-order keys via an implied equality constraint. + * This is incompatible with the current nbtree row comparison design, + * which compares all columns together, as an indivisible group. + * Alternative designs that can be used alongside skip arrays are + * possible, but it's not clear that they're really worth pursuing. + * + * A RowCompare qual "(a, b, c) > (10, 'foo', 42)" is equivalent to + * "((a=10 AND b='foo' AND c>42) OR (a=10 AND b>'foo') OR (a>10))". 
+ * Such a RowCompare can be decomposed into 3 disjuncts, each of which + * can be executed as a separate "single value" index scan. That'd + * give all 3 scans the ability to add skip arrays in the usual way + * (when there are any scalar low-order keys after the RowCompare). + * Under this scheme, a qual "(a, b, c) > (10, 'foo', 42) AND d = 99" + * performs 3 separate scans, each of which can mark keys up to and + * including its "d = 99" key as required to continue the scan. + */ + if (attno_has_rowcompare) + break; + + /* + * Now consider next attno_inkey (or keep going if this is an + * additional scan key against the same attribute) + */ + if (attno_inkey < inkey->sk_attno) + { + /* + * Now add skip array for previous scan key's attribute, though + * only if the attribute has no equality strategy scan keys + */ + if (attno_has_equal) + { + /* Attributes with an = key must have InvalidOid eq_op set */ + skip_eq_ops[attno_skip - 1] = InvalidOid; + } + else + { + Oid opfamily = rel->rd_opfamily[attno_skip - 1]; + Oid opcintype = rel->rd_opcintype[attno_skip - 1]; + + /* Look up input opclass's equality operator (might fail) */ + skip_eq_ops[attno_skip - 1] = + get_opfamily_member(opfamily, opcintype, opcintype, + BTEqualStrategyNumber); + + if (!OidIsValid(skip_eq_ops[attno_skip - 1])) + { + /* + * Input opclass lacks an equality strategy operator, so + * don't generate a skip array that definitely won't work + */ + break; + } + + /* plan on adding a backfill skip array for this attribute */ + (*numSkipArrayKeys)++; + } + + /* Set things up for this new attribute */ + attno_skip++; + attno_inkey = inkey->sk_attno; + attno_has_equal = false; + } + + /* + * Track if this attribute's scan keys include any equality strategy + * scan keys (IS NULL keys count as equality keys here). Also track + * if it has any RowCompare keys. 
+ */ + if (inkey->sk_strategy == BTEqualStrategyNumber || + (inkey->sk_flags & SK_SEARCHNULL)) + attno_has_equal = true; + if (inkey->sk_flags & SK_ROW_HEADER) + attno_has_rowcompare = true; + } + + return numSAOPArrayKeys + *numSkipArrayKeys; +} + /* * _bt_find_extreme_element() -- get least or greatest array element * diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 4a0bf069f995..bdadbf73cd91 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -31,6 +31,7 @@ #include "storage/ipc.h" #include "storage/lmgr.h" #include "storage/read_stream.h" +#include "utils/datum.h" #include "utils/fmgrprotos.h" #include "utils/index_selfuncs.h" #include "utils/memutils.h" @@ -76,14 +77,26 @@ typedef struct BTParallelScanDescData /* * btps_arrElems is used when scans need to schedule another primitive - * index scan. Holds BTArrayKeyInfo.cur_elem offsets for scan keys. + * index scan with one or more SAOP arrays. Holds BTArrayKeyInfo.cur_elem + * offsets for each = scan key associated with a ScalarArrayOp array. */ int btps_arrElems[FLEXIBLE_ARRAY_MEMBER]; + + /* + * Additional space (at the end of the struct) is used when scans need to + * schedule another primitive index scan with one or more skip arrays. + * Holds a flattened datum representation for each = scan key associated + * with a skip array. 
+ */ } BTParallelScanDescData; typedef struct BTParallelScanDescData *BTParallelScanDesc; +static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan, + BTScanOpaque so); +static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan, + BTScanOpaque so); static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid); @@ -541,10 +554,167 @@ btrestrpos(IndexScanDesc scan) * btestimateparallelscan -- estimate storage for BTParallelScanDescData */ Size -btestimateparallelscan(int nkeys, int norderbys) +btestimateparallelscan(Relation rel, int nkeys, int norderbys) +{ + int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + Size estnbtreeshared, + genericattrspace; + + /* + * Pessimistically assume that every input scan key will be output with + * its own SAOP array + */ + estnbtreeshared = offsetof(BTParallelScanDescData, btps_arrElems) + + sizeof(int) * nkeys; + + /* Single column indexes cannot possibly use a skip array */ + if (nkeyatts == 1) + return estnbtreeshared; + + /* + * Pessimistically assume that all attributes prior to the least + * significant attribute require a skip array (and an associated key) + */ + genericattrspace = datumEstimateSpace((Datum) 0, false, true, + sizeof(Datum)); + for (int attnum = 1; attnum < nkeyatts; attnum++) + { + CompactAttribute *attr; + + /* + * We make the conservative assumption that every index column will + * also require a skip array. + * + * Every skip array must have space to store its scan key's sk_flags. 
+ */ + estnbtreeshared = add_size(estnbtreeshared, sizeof(int)); + + /* Consider space required to store a datum of opclass input type */ + attr = TupleDescCompactAttr(rel->rd_att, attnum - 1); + if (attr->attbyval) + { + /* This index attribute stores pass-by-value datums */ + Size estfixed = datumEstimateSpace((Datum) 0, false, + true, attr->attlen); + + estnbtreeshared = add_size(estnbtreeshared, estfixed); + continue; + } + + /* + * This index attribute stores pass-by-reference datums. + * + * Assume that serializing this array will use just as much space as a + * pass-by-value datum, in addition to space for the largest possible + * whole index tuple (this is not just a per-datum portion of the + * largest possible tuple because that'd be almost as large anyway). + * + * This is quite conservative, but it's not clear how we could do much + * better. The executor requires an up-front storage request size + * that reliably covers the scan's high watermark memory usage. We + * can't be sure of the real high watermark until the scan is over. + */ + estnbtreeshared = add_size(estnbtreeshared, genericattrspace); + estnbtreeshared = add_size(estnbtreeshared, BTMaxItemSize); + } + + return estnbtreeshared; +} + +/* + * _bt_parallel_serialize_arrays() -- Serialize parallel array state. + * + * Caller must have exclusively locked btscan->btps_lock when called. 
+ */ +static void +_bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan, + BTScanOpaque so) +{ + char *datumshared; + + /* Space for serialized datums begins immediately after btps_arrElems[] */ + datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]); + for (int i = 0; i < so->numArrayKeys; i++) + { + BTArrayKeyInfo *array = &so->arrayKeys[i]; + ScanKey skey = &so->keyData[array->scan_key]; + + if (array->num_elems != -1) + { + /* Save SAOP array's cur_elem (no need to copy key/datum) */ + Assert(!(skey->sk_flags & SK_BT_SKIP)); + btscan->btps_arrElems[i] = array->cur_elem; + continue; + } + + /* Save all mutable state associated with skip array's key */ + Assert(skey->sk_flags & SK_BT_SKIP); + memcpy(datumshared, &skey->sk_flags, sizeof(int)); + datumshared += sizeof(int); + + if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)) + { + /* No sk_argument datum to serialize */ + Assert(skey->sk_argument == 0); + continue; + } + + datumSerialize(skey->sk_argument, (skey->sk_flags & SK_ISNULL) != 0, + array->attbyval, array->attlen, &datumshared); + } +} + +/* + * _bt_parallel_restore_arrays() -- Restore serialized parallel array state. + * + * Caller must have exclusively locked btscan->btps_lock when called. 
+ */ +static void +_bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan, + BTScanOpaque so) { - /* Pessimistically assume all input scankeys will be output with arrays */ - return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys; + char *datumshared; + + /* Space for serialized datums begins immediately after btps_arrElems[] */ + datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]); + for (int i = 0; i < so->numArrayKeys; i++) + { + BTArrayKeyInfo *array = &so->arrayKeys[i]; + ScanKey skey = &so->keyData[array->scan_key]; + bool isnull; + + if (array->num_elems != -1) + { + /* Restore SAOP array using its saved cur_elem */ + Assert(!(skey->sk_flags & SK_BT_SKIP)); + array->cur_elem = btscan->btps_arrElems[i]; + skey->sk_argument = array->elem_values[array->cur_elem]; + continue; + } + + /* Restore skip array by restoring its key directly */ + if (!array->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + skey->sk_argument = (Datum) 0; + memcpy(&skey->sk_flags, datumshared, sizeof(int)); + datumshared += sizeof(int); + + Assert(skey->sk_flags & SK_BT_SKIP); + + if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)) + { + /* No sk_argument datum to restore */ + continue; + } + + skey->sk_argument = datumRestore(&datumshared, &isnull); + if (isnull) + { + Assert(skey->sk_argument == 0); + Assert(skey->sk_flags & SK_SEARCHNULL); + Assert(skey->sk_flags & SK_ISNULL); + } + } } /* @@ -613,6 +783,7 @@ bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; bool exit_loop = false, status = true, @@ -679,14 +850,9 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, { /* Can start scheduled primitive scan right away, so do so */ btscan->btps_pageStatus = BTPARALLEL_ADVANCING; - for (int i = 0; i < so->numArrayKeys; i++) - { - BTArrayKeyInfo 
*array = &so->arrayKeys[i]; - ScanKey skey = &so->keyData[array->scan_key]; - array->cur_elem = btscan->btps_arrElems[i]; - skey->sk_argument = array->elem_values[array->cur_elem]; - } + /* Restore scan's array keys from serialized values */ + _bt_parallel_restore_arrays(rel, btscan, so); exit_loop = true; } else @@ -831,6 +997,7 @@ _bt_parallel_done(IndexScanDesc scan) void _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; ParallelIndexScanDesc parallel_scan = scan->parallel_scan; BTParallelScanDesc btscan; @@ -849,12 +1016,7 @@ _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page) btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN; /* Serialize scan's current array keys */ - for (int i = 0; i < so->numArrayKeys; i++) - { - BTArrayKeyInfo *array = &so->arrayKeys[i]; - - btscan->btps_arrElems[i] = array->cur_elem; - } + _bt_parallel_serialize_arrays(rel, btscan, so); } LWLockRelease(&btscan->btps_lock); } diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 3d46fb5df782..1ef2cb2b55ed 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -983,7 +983,21 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * one we use --- by definition, they are either redundant or * contradictory. * - * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier. + * In practice we rarely see any "attribute boundary key gaps" here. + * Preprocessing can usually backfill skip array keys for any attributes + * that were omitted from the original scan->keyData[] input keys. All + * array keys are always considered = keys, but we'll sometimes need to + * treat the current key value as if we were using an inequality strategy. 
+ * This happens with range skip arrays, which store inequality keys in the + * array's low_compare/high_compare fields (used to find the first/last + * set of matches, when = key will lack a usable sk_argument value). + * These are always preferred over any redundant "standard" inequality + * keys on the same column (per the usual rule about preferring = keys). + * Note also that any column with an = skip array key can never have an + * additional, contradictory = key. + * + * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP + * array keys whose array is "null_elem=true") imply a NOT NULL qualifier. * If the index stores nulls at the end of the index we'll be starting * from, and we have no boundary key for the column (which means the key * we deduced NOT NULL from is an inequality key that constrains the other @@ -1040,8 +1054,54 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) if (i >= so->numberOfKeys || cur->sk_attno != curattr) { /* - * Done looking at keys for curattr. If we didn't find a - * usable boundary key, see if we can deduce a NOT NULL key. + * Done looking at keys for curattr. + * + * If this is a scan key for a skip array whose current + * element is MINVAL, choose low_compare (when scanning + * backwards it'll be MAXVAL, and we'll choose high_compare). + * + * Note: if the array's low_compare key makes 'chosen' NULL, + * then we behave as if the array's first element is -inf, + * except when !array->null_elem implies a usable NOT NULL + * constraint. 
+ */ + if (chosen != NULL && + (chosen->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))) + { + int ikey = chosen - so->keyData; + ScanKey skipequalitykey = chosen; + BTArrayKeyInfo *array = NULL; + + for (int arridx = 0; arridx < so->numArrayKeys; arridx++) + { + array = &so->arrayKeys[arridx]; + if (array->scan_key == ikey) + break; + } + + if (ScanDirectionIsForward(dir)) + { + Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL)); + chosen = array->low_compare; + } + else + { + Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL)); + chosen = array->high_compare; + } + + Assert(chosen == NULL || + chosen->sk_attno == skipequalitykey->sk_attno); + + if (!array->null_elem) + impliesNN = skipequalitykey; + else + Assert(chosen == NULL && impliesNN == NULL); + } + + /* + * If we didn't find a usable boundary key, see if we can + * deduce a NOT NULL key */ if (chosen == NULL && impliesNN != NULL && ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? @@ -1084,9 +1144,40 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) break; /* - * Done if that was the last attribute, or if next key is not - * in sequence (implying no boundary key is available for the - * next attribute). + * If the key that we just added to startKeys[] is a skip + * array = key whose current element is marked NEXT or PRIOR, + * make strat_total > or < (and stop adding boundary keys). + * This can only happen with opclasses that lack skip support. + */ + if (chosen->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR)) + { + Assert(chosen->sk_flags & SK_BT_SKIP); + Assert(strat_total == BTEqualStrategyNumber); + + if (ScanDirectionIsForward(dir)) + { + Assert(!(chosen->sk_flags & SK_BT_PRIOR)); + strat_total = BTGreaterStrategyNumber; + } + else + { + Assert(!(chosen->sk_flags & SK_BT_NEXT)); + strat_total = BTLessStrategyNumber; + } + + /* + * We're done. We'll never find an exact = match for a + * NEXT or PRIOR sentinel sk_argument value. There's no + * sense in trying to add more keys to startKeys[]. 
+ */ + break; + } + + /* + * Done if that was the last scan key output by preprocessing. + * Also done if there is a gap index attribute that lacks a + * usable key (only possible when preprocessing was unable to + * generate a skip array key to "fill in the gap"). */ if (i >= so->numberOfKeys || cur->sk_attno != curattr + 1) @@ -1581,31 +1672,10 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, * We skip this for the first page read by each (primitive) scan, to avoid * slowing down point queries. They typically don't stand to gain much * when the optimization can be applied, and are more likely to notice the - * overhead of the precheck. - * - * The optimization is unsafe and must be avoided whenever _bt_checkkeys - * just set a low-order required array's key to the best available match - * for a truncated -inf attribute value from the prior page's high key - * (array element 0 is always the best available match in this scenario). - * It's quite likely that matches for array element 0 begin on this page, - * but the start of matches won't necessarily align with page boundaries. - * When the start of matches is somewhere in the middle of this page, it - * would be wrong to treat page's final non-pivot tuple as representative. - * Doing so might lead us to treat some of the page's earlier tuples as - * being part of a group of tuples thought to satisfy the required keys. - * - * Note: Conversely, in the case where the scan's arrays just advanced - * using the prior page's HIKEY _without_ advancement setting scanBehind, - * the start of matches must be aligned with page boundaries, which makes - * it safe to attempt the optimization here now. It's also safe when the - * prior page's HIKEY simply didn't need to advance any required array. In - * both cases we can safely assume that the _first_ tuple from this page - * must be >= the current set of array keys/equality constraints. 
And so - * if the final tuple is == those same keys (and also satisfies any - * required < or <= strategy scan keys) during the precheck, we can safely - * assume that this must also be true of all earlier tuples from the page. + * overhead of the precheck. Also avoid it during scans with array keys, + * which might be using skip scan (XXX fixed in next commit). */ - if (!pstate.firstpage && !so->scanBehind && minoff < maxoff) + if (!pstate.firstpage && !arrayKeys && minoff < maxoff) { ItemId iid; IndexTuple itup; diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 2aee9bbf67d2..108030a8ee7c 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -30,6 +30,17 @@ static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc, Datum tupdatum, bool tupnull, Datum arrdatum, ScanKey cur); +static void _bt_binsrch_skiparray_skey(bool cur_elem_trig, ScanDirection dir, + Datum tupdatum, bool tupnull, + BTArrayKeyInfo *array, ScanKey cur, + int32 *set_elem_result); +static void _bt_skiparray_set_element(Relation rel, ScanKey skey, BTArrayKeyInfo *array, + int32 set_elem_result, Datum tupdatum, bool tupnull); +static void _bt_skiparray_set_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array); +static void _bt_array_set_low_or_high(Relation rel, ScanKey skey, + BTArrayKeyInfo *array, bool low_not_high); +static bool _bt_array_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array); +static bool _bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array); static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir); static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir); static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, @@ -207,6 +218,7 @@ _bt_compare_array_skey(FmgrInfo *orderproc, int32 result = 0; Assert(cur->sk_strategy == BTEqualStrategyNumber); + Assert(!(cur->sk_flags & (SK_BT_MINVAL | 
SK_BT_MAXVAL))); if (tupnull) /* NULL tupdatum */ { @@ -283,6 +295,8 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, Datum arrdatum; Assert(cur->sk_flags & SK_SEARCHARRAY); + Assert(!(cur->sk_flags & SK_BT_SKIP)); + Assert(!(cur->sk_flags & SK_ISNULL)); /* SAOP arrays never have NULLs */ Assert(cur->sk_strategy == BTEqualStrategyNumber); if (cur_elem_trig) @@ -405,6 +419,186 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, return low_elem; } +/* + * _bt_binsrch_skiparray_skey() -- "Binary search" within a skip array + * + * Does not return an index into the array, since skip arrays don't really + * contain elements (they generate their array elements procedurally instead). + * Our interface matches that of _bt_binsrch_array_skey in every other way. + * + * Sets *set_elem_result just like _bt_binsrch_array_skey would with a true + * array. The value 0 indicates that tupdatum/tupnull is within the range of + * the skip array. We return -1 when tupdatum/tupnull is lower than any value + * within the range of the array, and 1 when it is higher than every value. + * Caller should pass *set_elem_result to _bt_skiparray_set_element to advance + * the array. + * + * cur_elem_trig indicates if array advancement was triggered by this array's + * scan key. We use this to optimize-away comparisons that are known by our + * caller to be unnecessary from context, just like _bt_binsrch_array_skey. 
+ */ +static void +_bt_binsrch_skiparray_skey(bool cur_elem_trig, ScanDirection dir, + Datum tupdatum, bool tupnull, + BTArrayKeyInfo *array, ScanKey cur, + int32 *set_elem_result) +{ + Assert(cur->sk_flags & SK_BT_SKIP); + Assert(cur->sk_flags & SK_SEARCHARRAY); + Assert(cur->sk_flags & SK_BT_REQFWD); + Assert(array->num_elems == -1); + Assert(!ScanDirectionIsNoMovement(dir)); + + if (array->null_elem) + { + Assert(!array->low_compare && !array->high_compare); + + *set_elem_result = 0; + return; + } + + if (tupnull) /* NULL tupdatum */ + { + if (cur->sk_flags & SK_BT_NULLS_FIRST) + *set_elem_result = -1; /* NULL "<" NOT_NULL */ + else + *set_elem_result = 1; /* NULL ">" NOT_NULL */ + return; + } + + /* + * Array inequalities determine whether tupdatum is within the range of + * caller's skip array + */ + *set_elem_result = 0; + if (ScanDirectionIsForward(dir)) + { + /* + * Evaluate low_compare first (unless cur_elem_trig tells us that it + * cannot possibly fail to be satisfied), then evaluate high_compare + */ + if (!cur_elem_trig && array->low_compare && + !DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, + array->low_compare->sk_collation, + tupdatum, + array->low_compare->sk_argument))) + *set_elem_result = -1; + else if (array->high_compare && + !DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, + array->high_compare->sk_collation, + tupdatum, + array->high_compare->sk_argument))) + *set_elem_result = 1; + } + else + { + /* + * Evaluate high_compare first (unless cur_elem_trig tells us that it + * cannot possibly fail to be satisfied), then evaluate low_compare + */ + if (!cur_elem_trig && array->high_compare && + !DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, + array->high_compare->sk_collation, + tupdatum, + array->high_compare->sk_argument))) + *set_elem_result = 1; + else if (array->low_compare && + !DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, + array->low_compare->sk_collation, + tupdatum, + 
array->low_compare->sk_argument))) + *set_elem_result = -1; + } + + /* + * Assert that any keys that were assumed to be satisfied already (due to + * caller passing cur_elem_trig=true) really are satisfied as expected + */ +#ifdef USE_ASSERT_CHECKING + if (cur_elem_trig) + { + if (ScanDirectionIsForward(dir) && array->low_compare) + Assert(DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, + array->low_compare->sk_collation, + tupdatum, + array->low_compare->sk_argument))); + + if (ScanDirectionIsBackward(dir) && array->high_compare) + Assert(DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, + array->high_compare->sk_collation, + tupdatum, + array->high_compare->sk_argument))); + } +#endif +} + +/* + * _bt_skiparray_set_element() -- Set skip array scan key's sk_argument + * + * Caller passes set_elem_result returned by _bt_binsrch_skiparray_skey for + * caller's tupdatum/tupnull. + * + * We copy tupdatum/tupnull into skey's sk_argument iff set_elem_result == 0. + * Otherwise, we set skey to either the lowest or highest value that's within + * the range of caller's skip array (whichever is the best available match to + * tupdatum/tupnull that is still within the range of the skip array according + * to _bt_binsrch_skiparray_skey/set_elem_result). 
+ */ +static void +_bt_skiparray_set_element(Relation rel, ScanKey skey, BTArrayKeyInfo *array, + int32 set_elem_result, Datum tupdatum, bool tupnull) +{ + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(skey->sk_flags & SK_SEARCHARRAY); + + if (set_elem_result) + { + /* tupdatum/tupnull is out of the range of the skip array */ + Assert(!array->null_elem); + + _bt_array_set_low_or_high(rel, skey, array, set_elem_result < 0); + return; + } + + /* Advance skip array to tupdatum (or tupnull) value */ + if (unlikely(tupnull)) + { + _bt_skiparray_set_isnull(rel, skey, array); + return; + } + + /* Free memory previously allocated for sk_argument if needed */ + if (!array->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + + /* tupdatum becomes new sk_argument/new current element */ + skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL | + SK_BT_MINVAL | SK_BT_MAXVAL | + SK_BT_NEXT | SK_BT_PRIOR); + skey->sk_argument = datumCopy(tupdatum, array->attbyval, array->attlen); +} + +/* + * _bt_skiparray_set_isnull() -- set skip array scan key to NULL + */ +static void +_bt_skiparray_set_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(skey->sk_flags & SK_SEARCHARRAY); + Assert(array->null_elem && !array->low_compare && !array->high_compare); + + /* Free memory previously allocated for sk_argument if needed */ + if (!array->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + + /* NULL becomes new sk_argument/new current element */ + skey->sk_argument = (Datum) 0; + skey->sk_flags &= ~(SK_BT_MINVAL | SK_BT_MAXVAL | + SK_BT_NEXT | SK_BT_PRIOR); + skey->sk_flags |= (SK_SEARCHNULL | SK_ISNULL); +} + /* * _bt_start_array_keys() -- Initialize array keys at start of a scan * @@ -414,29 +608,355 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) 
scan->opaque; - int i; Assert(so->numArrayKeys); Assert(so->qual_ok); - for (i = 0; i < so->numArrayKeys; i++) + for (int i = 0; i < so->numArrayKeys; i++) { - BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; - ScanKey skey = &so->keyData[curArrayKey->scan_key]; + BTArrayKeyInfo *array = &so->arrayKeys[i]; + ScanKey skey = &so->keyData[array->scan_key]; - Assert(curArrayKey->num_elems > 0); Assert(skey->sk_flags & SK_SEARCHARRAY); - if (ScanDirectionIsBackward(dir)) - curArrayKey->cur_elem = curArrayKey->num_elems - 1; - else - curArrayKey->cur_elem = 0; - skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem]; + _bt_array_set_low_or_high(rel, skey, array, + ScanDirectionIsForward(dir)); } so->scanBehind = so->oppositeDirCheck = false; /* reset */ } +/* + * _bt_array_set_low_or_high() -- Set array scan key to lowest/highest element + * + * Caller also passes associated scan key, which will have its argument set to + * the lowest/highest array value in passing. + */ +static void +_bt_array_set_low_or_high(Relation rel, ScanKey skey, BTArrayKeyInfo *array, + bool low_not_high) +{ + Assert(skey->sk_flags & SK_SEARCHARRAY); + + if (array->num_elems != -1) + { + /* set low or high element for SAOP array */ + int set_elem = 0; + + Assert(!(skey->sk_flags & SK_BT_SKIP)); + + if (!low_not_high) + set_elem = array->num_elems - 1; + + /* + * Just copy over array datum (only skip arrays require freeing and + * allocating memory for sk_argument) + */ + array->cur_elem = set_elem; + skey->sk_argument = array->elem_values[set_elem]; + + return; + } + + /* set low or high element for skip array */ + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(array->num_elems == -1); + + /* Free memory previously allocated for sk_argument if needed */ + if (!array->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + + /* Reset flags */ + skey->sk_argument = (Datum) 0; + skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL | + SK_BT_MINVAL | SK_BT_MAXVAL | + 
SK_BT_NEXT | SK_BT_PRIOR); + + if (array->null_elem && + (low_not_high == ((skey->sk_flags & SK_BT_NULLS_FIRST) != 0))) + { + /* Requested element (either lowest or highest) has the value NULL */ + skey->sk_flags |= (SK_SEARCHNULL | SK_ISNULL); + } + else if (low_not_high) + { + /* Setting array to lowest element (according to low_compare) */ + skey->sk_flags |= SK_BT_MINVAL; + } + else + { + /* Setting array to highest element (according to high_compare) */ + skey->sk_flags |= SK_BT_MAXVAL; + } +} + +/* + * _bt_array_decrement() -- decrement array scan key's sk_argument + * + * Return value indicates whether caller's array was successfully decremented. + * Cannot decrement an array whose current element is already the first one. + */ +static bool +_bt_array_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + bool uflow = false; + Datum dec_sk_argument; + + Assert(skey->sk_flags & SK_SEARCHARRAY); + Assert(!(skey->sk_flags & (SK_BT_MAXVAL | SK_BT_NEXT | SK_BT_PRIOR))); + + /* SAOP array? 
*/ + if (array->num_elems != -1) + { + Assert(!(skey->sk_flags & (SK_BT_SKIP | SK_BT_MINVAL | SK_BT_MAXVAL))); + if (array->cur_elem > 0) + { + /* + * Just decrement current element, and assign its datum to skey + * (only skip arrays need us to free existing sk_argument memory) + */ + array->cur_elem--; + skey->sk_argument = array->elem_values[array->cur_elem]; + + /* Successfully decremented array */ + return true; + } + + /* Cannot decrement to before first array element */ + return false; + } + + /* Nope, this is a skip array */ + Assert(skey->sk_flags & SK_BT_SKIP); + + /* + * The sentinel value that represents the minimum value within the range + * of a skip array (often just -inf) is never decrementable + */ + if (skey->sk_flags & SK_BT_MINVAL) + return false; + + /* + * When the current array element is NULL, and the lowest sorting value in + * the index is also NULL, we cannot decrement before first array element + */ + if ((skey->sk_flags & SK_ISNULL) && (skey->sk_flags & SK_BT_NULLS_FIRST)) + return false; + + /* + * Opclasses without skip support "decrement" the scan key's current + * element by setting the PRIOR flag. The true prior value is determined + * by repositioning to the last index tuple < existing sk_argument/current + * array element. Note that this works in the usual way when the scan key + * is already marked ISNULL (i.e. when the current element is NULL). + */ + if (!array->sksup) + { + /* Successfully "decremented" array */ + skey->sk_flags |= SK_BT_PRIOR; + return true; + } + + /* + * Opclasses with skip support directly decrement sk_argument + */ + if (skey->sk_flags & SK_ISNULL) + { + Assert(!(skey->sk_flags & SK_BT_NULLS_FIRST)); + + /* + * Existing sk_argument/array element is NULL (for an IS NULL qual). + * + * "Decrement" from NULL to the high_elem value provided by opclass + * skip support routine. 
+ */ + skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL); + skey->sk_argument = datumCopy(array->sksup->high_elem, + array->attbyval, array->attlen); + return true; + } + + /* + * Ask opclass support routine to provide decremented copy of existing + * non-NULL sk_argument + */ + dec_sk_argument = array->sksup->decrement(rel, skey->sk_argument, &uflow); + if (unlikely(uflow)) + { + /* dec_sk_argument has undefined value (so no pfree) */ + if (array->null_elem && (skey->sk_flags & SK_BT_NULLS_FIRST)) + { + _bt_skiparray_set_isnull(rel, skey, array); + + /* Successfully "decremented" array to NULL */ + return true; + } + + /* Cannot decrement to before first array element */ + return false; + } + + /* + * Successfully decremented sk_argument to a non-NULL value. Make sure + * that the decremented value is still within the range of the array. + */ + if (array->low_compare && + !DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, + array->low_compare->sk_collation, + dec_sk_argument, + array->low_compare->sk_argument))) + { + /* Keep existing sk_argument after all */ + if (!array->attbyval) + pfree(DatumGetPointer(dec_sk_argument)); + + /* Cannot decrement to before first array element */ + return false; + } + + /* Accept value returned by opclass decrement callback */ + if (!array->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + skey->sk_argument = dec_sk_argument; + + /* Successfully decremented array */ + return true; +} + +/* + * _bt_array_increment() -- increment array scan key's sk_argument + * + * Return value indicates whether caller's array was successfully incremented. + * Cannot increment an array whose current element is already the final one. + */ +static bool +_bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + bool oflow = false; + Datum inc_sk_argument; + + Assert(skey->sk_flags & SK_SEARCHARRAY); + Assert(!(skey->sk_flags & (SK_BT_MINVAL | SK_BT_NEXT | SK_BT_PRIOR))); + + /* SAOP array? 
*/ + if (array->num_elems != -1) + { + Assert(!(skey->sk_flags & (SK_BT_SKIP | SK_BT_MINVAL | SK_BT_MAXVAL))); + if (array->cur_elem < array->num_elems - 1) + { + /* + * Just increment current element, and assign its datum to skey + * (only skip arrays need us to free existing sk_argument memory) + */ + array->cur_elem++; + skey->sk_argument = array->elem_values[array->cur_elem]; + + /* Successfully incremented array */ + return true; + } + + /* Cannot increment past final array element */ + return false; + } + + /* Nope, this is a skip array */ + Assert(skey->sk_flags & SK_BT_SKIP); + + /* + * The sentinel value that represents the maximum value within the range + * of a skip array (often just +inf) is never incrementable + */ + if (skey->sk_flags & SK_BT_MAXVAL) + return false; + + /* + * When the current array element is NULL, and the highest sorting value + * in the index is also NULL, we cannot increment past the final element + */ + if ((skey->sk_flags & SK_ISNULL) && !(skey->sk_flags & SK_BT_NULLS_FIRST)) + return false; + + /* + * Opclasses without skip support "increment" the scan key's current + * element by setting the NEXT flag. The true next value is determined by + * repositioning to the first index tuple > existing sk_argument/current + * array element. Note that this works in the usual way when the scan key + * is already marked ISNULL (i.e. when the current element is NULL). + */ + if (!array->sksup) + { + /* Successfully "incremented" array */ + skey->sk_flags |= SK_BT_NEXT; + return true; + } + + /* + * Opclasses with skip support directly increment sk_argument + */ + if (skey->sk_flags & SK_ISNULL) + { + Assert(skey->sk_flags & SK_BT_NULLS_FIRST); + + /* + * Existing sk_argument/array element is NULL (for an IS NULL qual). + * + * "Increment" from NULL to the low_elem value provided by opclass + * skip support routine. 
+ */ + skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL); + skey->sk_argument = datumCopy(array->sksup->low_elem, + array->attbyval, array->attlen); + return true; + } + + /* + * Ask opclass support routine to provide incremented copy of existing + * non-NULL sk_argument + */ + inc_sk_argument = array->sksup->increment(rel, skey->sk_argument, &oflow); + if (unlikely(oflow)) + { + /* inc_sk_argument has undefined value (so no pfree) */ + if (array->null_elem && !(skey->sk_flags & SK_BT_NULLS_FIRST)) + { + _bt_skiparray_set_isnull(rel, skey, array); + + /* Successfully "incremented" array to NULL */ + return true; + } + + /* Cannot increment past final array element */ + return false; + } + + /* + * Successfully incremented sk_argument to a non-NULL value. Make sure + * that the incremented value is still within the range of the array. + */ + if (array->high_compare && + !DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, + array->high_compare->sk_collation, + inc_sk_argument, + array->high_compare->sk_argument))) + { + /* Keep existing sk_argument after all */ + if (!array->attbyval) + pfree(DatumGetPointer(inc_sk_argument)); + + /* Cannot increment past final array element */ + return false; + } + + /* Accept value returned by opclass increment callback */ + if (!array->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + skey->sk_argument = inc_sk_argument; + + /* Successfully incremented array */ + return true; +} + /* * _bt_advance_array_keys_increment() -- Advance to next set of array elements * @@ -452,6 +972,7 @@ _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; /* @@ -461,29 +982,30 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) */ for (int i = so->numArrayKeys - 1; i >= 0; i--) { - BTArrayKeyInfo *curArrayKey = 
&so->arrayKeys[i]; - ScanKey skey = &so->keyData[curArrayKey->scan_key]; - int cur_elem = curArrayKey->cur_elem; - int num_elems = curArrayKey->num_elems; - bool rolled = false; + BTArrayKeyInfo *array = &so->arrayKeys[i]; + ScanKey skey = &so->keyData[array->scan_key]; - if (ScanDirectionIsForward(dir) && ++cur_elem >= num_elems) + if (ScanDirectionIsForward(dir)) { - cur_elem = 0; - rolled = true; + if (_bt_array_increment(rel, skey, array)) + return true; } - else if (ScanDirectionIsBackward(dir) && --cur_elem < 0) + else { - cur_elem = num_elems - 1; - rolled = true; + if (_bt_array_decrement(rel, skey, array)) + return true; } - curArrayKey->cur_elem = cur_elem; - skey->sk_argument = curArrayKey->elem_values[cur_elem]; - if (!rolled) - return true; + /* + * Couldn't increment (or decrement) array. Handle array roll over. + * + * Start over at the array's lowest sorting value (or its highest + * value, for backward scans)... + */ + _bt_array_set_low_or_high(rel, skey, array, + ScanDirectionIsForward(dir)); - /* Need to advance next array key, if any */ + /* ...then increment (or decrement) next most significant array */ } /* @@ -507,7 +1029,7 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) } /* - * _bt_rewind_nonrequired_arrays() -- Rewind non-required arrays + * _bt_rewind_nonrequired_arrays() -- Rewind SAOP arrays not marked required * * Called when _bt_advance_array_keys decides to start a new primitive index * scan on the basis of the current scan position being before the position @@ -539,10 +1061,15 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) * * Note: _bt_verify_arrays_bt_first is called by an assertion to enforce that * everybody got this right. + * + * Note: In practice almost all SAOP arrays are marked required during + * preprocessing (if necessary by generating skip arrays). It is hardly ever + * truly necessary to call here, but consistently doing so is simpler. 
*/ static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; int arrayidx = 0; @@ -550,7 +1077,6 @@ _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir) { ScanKey cur = so->keyData + ikey; BTArrayKeyInfo *array = NULL; - int first_elem_dir; if (!(cur->sk_flags & SK_SEARCHARRAY) || cur->sk_strategy != BTEqualStrategyNumber) @@ -562,16 +1088,10 @@ _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir) if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) continue; - if (ScanDirectionIsForward(dir)) - first_elem_dir = 0; - else - first_elem_dir = array->num_elems - 1; + Assert(array->num_elems != -1); /* No non-required skip arrays */ - if (array->cur_elem != first_elem_dir) - { - array->cur_elem = first_elem_dir; - cur->sk_argument = array->elem_values[first_elem_dir]; - } + _bt_array_set_low_or_high(rel, cur, array, + ScanDirectionIsForward(dir)); } } @@ -696,9 +1216,77 @@ _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull); - result = _bt_compare_array_skey(&so->orderProcs[ikey], - tupdatum, tupnull, - cur->sk_argument, cur); + if (likely(!(cur->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))) + { + /* Scankey has a valid/comparable sk_argument value */ + result = _bt_compare_array_skey(&so->orderProcs[ikey], + tupdatum, tupnull, + cur->sk_argument, cur); + + if (result == 0) + { + /* + * Interpret result in a way that takes NEXT/PRIOR into + * account + */ + if (cur->sk_flags & SK_BT_NEXT) + result = -1; + else if (cur->sk_flags & SK_BT_PRIOR) + result = 1; + + Assert(result == 0 || (cur->sk_flags & SK_BT_SKIP)); + } + } + else + { + BTArrayKeyInfo *array = NULL; + + /* + * Current array element/array = scan key value is a sentinel + * value that represents the lowest (or highest) possible value + * that's still within the range of the array. 
+ * + * Like _bt_first, we only see MINVAL keys during forwards scans + * (and similarly only see MAXVAL keys during backwards scans). + * Even if the scan's direction changes, we'll stop at some higher + * order key before we can ever reach any MAXVAL (or MINVAL) keys. + * (However, unlike _bt_first we _can_ get to keys marked either + * NEXT or PRIOR, regardless of the scan's current direction.) + */ + Assert(ScanDirectionIsForward(dir) ? + !(cur->sk_flags & SK_BT_MAXVAL) : + !(cur->sk_flags & SK_BT_MINVAL)); + + /* + * There are no valid sk_argument values in MINVAL/MAXVAL keys. + * Check if tupdatum is within the range of skip array instead. + */ + for (int arrayidx = 0; arrayidx < so->numArrayKeys; arrayidx++) + { + array = &so->arrayKeys[arrayidx]; + if (array->scan_key == ikey) + break; + } + + _bt_binsrch_skiparray_skey(false, dir, tupdatum, tupnull, + array, cur, &result); + + if (result == 0) + { + /* + * tupdatum satisfies both low_compare and high_compare, so + * it's time to advance the array keys. + * + * Note: It's possible that the skip array will "advance" from + * its MINVAL (or MAXVAL) representation to an alternative, + * logically equivalent representation of the same value: a + * representation where the = key gets a valid datum in its + * sk_argument. This is only possible when low_compare uses + * the >= strategy (or high_compare uses the <= strategy). 
+ */ + return false; + } + } /* * Does this comparison indicate that caller must _not_ advance the @@ -1017,18 +1605,9 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, */ if (beyond_end_advance) { - int final_elem_dir; - - if (ScanDirectionIsBackward(dir) || !array) - final_elem_dir = 0; - else - final_elem_dir = array->num_elems - 1; - - if (array && array->cur_elem != final_elem_dir) - { - array->cur_elem = final_elem_dir; - cur->sk_argument = array->elem_values[final_elem_dir]; - } + if (array) + _bt_array_set_low_or_high(rel, cur, array, + ScanDirectionIsBackward(dir)); continue; } @@ -1053,18 +1632,9 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, */ if (!all_required_satisfied || cur->sk_attno > tupnatts) { - int first_elem_dir; - - if (ScanDirectionIsForward(dir) || !array) - first_elem_dir = 0; - else - first_elem_dir = array->num_elems - 1; - - if (array && array->cur_elem != first_elem_dir) - { - array->cur_elem = first_elem_dir; - cur->sk_argument = array->elem_values[first_elem_dir]; - } + if (array) + _bt_array_set_low_or_high(rel, cur, array, + ScanDirectionIsForward(dir)); continue; } @@ -1080,14 +1650,22 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, bool cur_elem_trig = (sktrig_required && ikey == sktrig); /* - * Binary search for closest match that's available from the array + * "Binary search" by checking if tupdatum/tupnull are within the + * range of the skip array */ - set_elem = _bt_binsrch_array_skey(&so->orderProcs[ikey], - cur_elem_trig, dir, - tupdatum, tupnull, array, cur, - &result); + if (array->num_elems == -1) + _bt_binsrch_skiparray_skey(cur_elem_trig, dir, + tupdatum, tupnull, array, cur, + &result); - Assert(set_elem >= 0 && set_elem < array->num_elems); + /* + * Binary search for the closest match from the SAOP array + */ + else + set_elem = _bt_binsrch_array_skey(&so->orderProcs[ikey], + cur_elem_trig, dir, + tupdatum, tupnull, array, cur, + &result); } else { 
@@ -1163,11 +1741,21 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, } } - /* Advance array keys, even when set_elem isn't an exact match */ - if (array && array->cur_elem != set_elem) + /* Advance array keys, even when we don't have an exact match */ + if (array) { - array->cur_elem = set_elem; - cur->sk_argument = array->elem_values[set_elem]; + if (array->num_elems == -1) + { + /* Skip array's new element is tupdatum (or MINVAL/MAXVAL) */ + _bt_skiparray_set_element(rel, cur, array, result, + tupdatum, tupnull); + } + else if (array->cur_elem != set_elem) + { + /* SAOP array's new element is set_elem datum */ + array->cur_elem = set_elem; + cur->sk_argument = array->elem_values[set_elem]; + } } } @@ -1581,10 +2169,11 @@ _bt_verify_keys_with_arraykeys(IndexScanDesc scan) if (array->scan_key != ikey) return false; - if (array->num_elems <= 0) + if (array->num_elems == 0 || array->num_elems < -1) return false; - if (cur->sk_argument != array->elem_values[array->cur_elem]) + if (array->num_elems != -1 && + cur->sk_argument != array->elem_values[array->cur_elem]) return false; if (last_sk_attno > cur->sk_attno) return false; @@ -1914,6 +2503,20 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, continue; } + /* + * A skip array scan key uses one of several sentinel values. We just + * fall back on _bt_tuple_before_array_skeys when we see such a value. 
+ */ + if (key->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL | + SK_BT_NEXT | SK_BT_PRIOR)) + { + Assert(key->sk_flags & SK_SEARCHARRAY); + Assert(key->sk_flags & SK_BT_SKIP); + + *continuescan = false; + return false; + } + /* row-comparison keys need special processing */ if (key->sk_flags & SK_ROW_HEADER) { @@ -1939,6 +2542,7 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, else { Assert(key->sk_flags & SK_SEARCHNOTNULL); + Assert(!(key->sk_flags & SK_BT_SKIP)); if (!isNull) continue; /* tuple satisfies this qual */ } diff --git a/src/backend/access/nbtree/nbtvalidate.c b/src/backend/access/nbtree/nbtvalidate.c index dd6f5a15c653..817ad358f0ca 100644 --- a/src/backend/access/nbtree/nbtvalidate.c +++ b/src/backend/access/nbtree/nbtvalidate.c @@ -106,6 +106,10 @@ btvalidate(Oid opclassoid) case BTOPTIONS_PROC: ok = check_amoptsproc_signature(procform->amproc); break; + case BTSKIPSUPPORT_PROC: + ok = check_amproc_signature(procform->amproc, VOIDOID, true, + 1, 1, INTERNALOID); + break; default: ereport(INFO, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), diff --git a/src/backend/commands/opclasscmds.c b/src/backend/commands/opclasscmds.c index 8546366ee06c..a6dd8eab5186 100644 --- a/src/backend/commands/opclasscmds.c +++ b/src/backend/commands/opclasscmds.c @@ -1331,6 +1331,31 @@ assignProcTypes(OpFamilyMember *member, Oid amoid, Oid typeoid, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("ordering equal image functions must not be cross-type"))); } + else if (member->number == BTSKIPSUPPORT_PROC) + { + if (procform->pronargs != 1 || + procform->proargtypes.values[0] != INTERNALOID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("btree skip support functions must accept type \"internal\""))); + if (procform->prorettype != VOIDOID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("btree skip support functions must return void"))); + + /* + * pg_amproc functions are indexed by (lefttype, righttype), but a + 
* skip support function doesn't make sense in cross-type + * scenarios. The same opclass opcintype OID is always used for + * lefttype and righttype. Providing a cross-type routine isn't + * sensible. Reject cross-type ALTER OPERATOR FAMILY ... ADD + * FUNCTION 6 statements here. + */ + if (member->lefttype != member->righttype) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("btree skip support functions must not be cross-type"))); + } } else if (GetIndexAmRoutineByAmId(amoid, false)->amcanhash) { diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index 35e8c01aab94..4a233b63c328 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -99,6 +99,7 @@ OBJS = \ rowtypes.o \ ruleutils.o \ selfuncs.o \ + skipsupport.o \ tid.o \ timestamp.o \ trigfuncs.o \ diff --git a/src/backend/utils/adt/date.c b/src/backend/utils/adt/date.c index f279853deb80..4227ab1a72bf 100644 --- a/src/backend/utils/adt/date.c +++ b/src/backend/utils/adt/date.c @@ -34,6 +34,7 @@ #include "utils/date.h" #include "utils/datetime.h" #include "utils/numeric.h" +#include "utils/skipsupport.h" #include "utils/sortsupport.h" /* @@ -462,6 +463,51 @@ date_sortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +date_decrement(Relation rel, Datum existing, bool *underflow) +{ + DateADT dexisting = DatumGetDateADT(existing); + + if (dexisting == DATEVAL_NOBEGIN) + { + /* return value is undefined */ + *underflow = true; + return (Datum) 0; + } + + *underflow = false; + return DateADTGetDatum(dexisting - 1); +} + +static Datum +date_increment(Relation rel, Datum existing, bool *overflow) +{ + DateADT dexisting = DatumGetDateADT(existing); + + if (dexisting == DATEVAL_NOEND) + { + /* return value is undefined */ + *overflow = true; + return (Datum) 0; + } + + *overflow = false; + return DateADTGetDatum(dexisting + 1); +} + +Datum +date_skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) 
PG_GETARG_POINTER(0); + + sksup->decrement = date_decrement; + sksup->increment = date_increment; + sksup->low_elem = DateADTGetDatum(DATEVAL_NOBEGIN); + sksup->high_elem = DateADTGetDatum(DATEVAL_NOEND); + + PG_RETURN_VOID(); +} + Datum hashdate(PG_FUNCTION_ARGS) { diff --git a/src/backend/utils/adt/meson.build b/src/backend/utils/adt/meson.build index f23cfad7182a..244f48f4fd71 100644 --- a/src/backend/utils/adt/meson.build +++ b/src/backend/utils/adt/meson.build @@ -86,6 +86,7 @@ backend_sources += files( 'rowtypes.c', 'ruleutils.c', 'selfuncs.c', + 'skipsupport.c', 'tid.c', 'timestamp.c', 'trigfuncs.c', diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 5b35debc8ffd..385c20c62333 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -193,6 +193,8 @@ static double convert_timevalue_to_scalar(Datum value, Oid typid, bool *failure); static void examine_simple_variable(PlannerInfo *root, Var *var, VariableStatData *vardata); +static void examine_indexcol_variable(PlannerInfo *root, IndexOptInfo *index, + int indexcol, VariableStatData *vardata); static bool get_variable_range(PlannerInfo *root, VariableStatData *vardata, Oid sortop, Oid collation, Datum *min, Datum *max); @@ -214,6 +216,8 @@ static bool get_actual_variable_endpoint(Relation heapRel, MemoryContext outercontext, Datum *endpointDatum); static RelOptInfo *find_join_input_rel(PlannerInfo *root, Relids relids); +static double btcost_correlation(IndexOptInfo *index, + VariableStatData *vardata); /* @@ -5943,6 +5947,92 @@ examine_simple_variable(PlannerInfo *root, Var *var, } } +/* + * examine_indexcol_variable + * Try to look up statistical data about an index column/expression. + * Fill in a VariableStatData struct to describe the column. 
+ * + * Inputs: + * root: the planner info + * index: the index whose column we're interested in + * indexcol: 0-based index column number (subscripts index->indexkeys[]) + * + * Outputs: *vardata is filled as follows: + * var: the input expression (with any binary relabeling stripped, if + * it is or contains a variable; but otherwise the type is preserved) + * rel: RelOptInfo for table relation containing variable. + * statsTuple: the pg_statistic entry for the variable, if one exists; + * otherwise NULL. + * freefunc: pointer to a function to release statsTuple with. + * + * Caller is responsible for doing ReleaseVariableStats() before exiting. + */ +static void +examine_indexcol_variable(PlannerInfo *root, IndexOptInfo *index, + int indexcol, VariableStatData *vardata) +{ + AttrNumber colnum; + Oid relid; + + if (index->indexkeys[indexcol] != 0) + { + /* Simple variable --- look to stats for the underlying table */ + RangeTblEntry *rte = planner_rt_fetch(index->rel->relid, root); + + Assert(rte->rtekind == RTE_RELATION); + relid = rte->relid; + Assert(relid != InvalidOid); + colnum = index->indexkeys[indexcol]; + vardata->rel = index->rel; + + if (get_relation_stats_hook && + (*get_relation_stats_hook) (root, rte, colnum, vardata)) + { + /* + * The hook took control of acquiring a stats tuple. If it did + * supply a tuple, it'd better have supplied a freefunc. 
+ */ + if (HeapTupleIsValid(vardata->statsTuple) && + !vardata->freefunc) + elog(ERROR, "no function provided to release variable stats with"); + } + else + { + vardata->statsTuple = SearchSysCache3(STATRELATTINH, + ObjectIdGetDatum(relid), + Int16GetDatum(colnum), + BoolGetDatum(rte->inh)); + vardata->freefunc = ReleaseSysCache; + } + } + else + { + /* Expression --- maybe there are stats for the index itself */ + relid = index->indexoid; + colnum = indexcol + 1; + + if (get_index_stats_hook && + (*get_index_stats_hook) (root, relid, colnum, vardata)) + { + /* + * The hook took control of acquiring a stats tuple. If it did + * supply a tuple, it'd better have supplied a freefunc. + */ + if (HeapTupleIsValid(vardata->statsTuple) && + !vardata->freefunc) + elog(ERROR, "no function provided to release variable stats with"); + } + else + { + vardata->statsTuple = SearchSysCache3(STATRELATTINH, + ObjectIdGetDatum(relid), + Int16GetDatum(colnum), + BoolGetDatum(false)); + vardata->freefunc = ReleaseSysCache; + } + } +} + /* * Check whether it is permitted to call func_oid passing some of the * pg_statistic data in vardata. We allow this either if the user has SELECT @@ -7001,6 +7091,53 @@ add_predicate_to_index_quals(IndexOptInfo *index, List *indexQuals) return list_concat(predExtraQuals, indexQuals); } +/* + * Estimate correlation of btree index's first column. + * + * If we can get an estimate of the first column's ordering correlation C + * from pg_statistic, estimate the index correlation as C for a single-column + * index, or C * 0.75 for multiple columns. The idea here is that multiple + * columns dilute the importance of the first column's ordering, but don't + * negate it entirely. + * + * We already filled in the stats tuple for *vardata when called. 
+ */ +static double +btcost_correlation(IndexOptInfo *index, VariableStatData *vardata) +{ + Oid sortop; + AttStatsSlot sslot; + double indexCorrelation = 0; + + Assert(HeapTupleIsValid(vardata->statsTuple)); + + sortop = get_opfamily_member(index->opfamily[0], + index->opcintype[0], + index->opcintype[0], + BTLessStrategyNumber); + if (OidIsValid(sortop) && + get_attstatsslot(&sslot, vardata->statsTuple, + STATISTIC_KIND_CORRELATION, sortop, + ATTSTATSSLOT_NUMBERS)) + { + double varCorrelation; + + Assert(sslot.nnumbers == 1); + varCorrelation = sslot.numbers[0]; + + if (index->reverse_sort[0]) + varCorrelation = -varCorrelation; + + if (index->nkeycolumns > 1) + indexCorrelation = varCorrelation * 0.75; + else + indexCorrelation = varCorrelation; + + free_attstatsslot(&sslot); + } + + return indexCorrelation; +} void btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, @@ -7010,17 +7147,19 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, { IndexOptInfo *index = path->indexinfo; GenericCosts costs = {0}; - Oid relid; - AttrNumber colnum; VariableStatData vardata = {0}; double numIndexTuples; Cost descentCost; List *indexBoundQuals; + List *indexSkipQuals; int indexcol; bool eqQualHere; - bool found_saop; + bool found_row_compare; + bool found_array; bool found_is_null_op; + bool have_correlation = false; double num_sa_scans; + double correlation = 0.0; ListCell *lc; /* @@ -7031,19 +7170,24 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, * it's OK to count them in indexSelectivity, but they should not count * for estimating numIndexTuples. So we must examine the given indexquals * to find out which ones count as boundary quals. We rely on the - * knowledge that they are given in index column order. + * knowledge that they are given in index column order. 
Note that nbtree + * preprocessing can add skip arrays that act as leading '=' quals in the + * absence of ordinary input '=' quals, so in practice _most_ input quals + * are able to act as index bound quals (which we take into account here). * * For a RowCompareExpr, we consider only the first column, just as * rowcomparesel() does. * - * If there's a ScalarArrayOpExpr in the quals, we'll actually perform up - * to N index descents (not just one), but the ScalarArrayOpExpr's + * If there's a SAOP or skip array in the quals, we'll actually perform up + * to N index descents (not just one), but the underlying array key's * operator can be considered to act the same as it normally does. */ indexBoundQuals = NIL; + indexSkipQuals = NIL; indexcol = 0; eqQualHere = false; - found_saop = false; + found_row_compare = false; + found_array = false; found_is_null_op = false; num_sa_scans = 1; foreach(lc, path->indexclauses) @@ -7051,17 +7195,203 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, IndexClause *iclause = lfirst_node(IndexClause, lc); ListCell *lc2; - if (indexcol != iclause->indexcol) + if (indexcol < iclause->indexcol) { - /* Beginning of a new column's quals */ - if (!eqQualHere) - break; /* done if no '=' qual for indexcol */ + double num_sa_scans_prev_cols = num_sa_scans; + + /* + * Beginning of a new column's quals. + * + * Skip scans use skip arrays, which are ScalarArrayOp style + * arrays that generate their elements procedurally and on demand. + * Given a composite index on "(a, b)", and an SQL WHERE clause + * "WHERE b = 42", a skip scan will effectively use an indexqual + * "WHERE a = ANY('{every col a value}') AND b = 42". (Obviously, + * the array on "a" must also return "IS NULL" matches, since our + * WHERE clause used no strict operator on "a"). + * + * Here we consider how nbtree will backfill skip arrays for any + * index columns that lacked an '=' qual. 
This maintains our + * num_sa_scans estimate, and determines if this new column (the + * "iclause->indexcol" column, not the prior "indexcol" column) + * can have its RestrictInfos/quals added to indexBoundQuals. + * + * We'll need to handle columns that have inequality quals, where + * the skip array generates values from a range constrained by the + * quals (not every possible value). We've been maintaining + * indexSkipQuals to help with this; it will now contain all of + * the prior column's quals (that is, indexcol's quals) when they + * might be used for this. + */ + if (found_row_compare) + { + /* + * Skip arrays can't be added after a RowCompare input qual + * due to limitations in nbtree + */ + break; + } + if (eqQualHere) + { + /* + * Don't need to add a skip array for an indexcol that already + * has an '=' qual/equality constraint + */ + indexcol++; + indexSkipQuals = NIL; + } eqQualHere = false; - indexcol++; + + while (indexcol < iclause->indexcol) + { + double ndistinct; + bool isdefault = true; + + found_array = true; + + /* + * A skipped attribute's ndistinct forms the basis of our + * estimate of the total number of "array elements" used by + * its skip array at runtime. Look that up first. 
+ */ + examine_indexcol_variable(root, index, indexcol, &vardata); + ndistinct = get_variable_numdistinct(&vardata, &isdefault); + + if (indexcol == 0) + { + /* + * Get an estimate of the leading column's correlation in + * passing (avoids rereading variable stats below) + */ + if (HeapTupleIsValid(vardata.statsTuple)) + correlation = btcost_correlation(index, &vardata); + have_correlation = true; + } + + ReleaseVariableStats(vardata); + + /* + * If ndistinct is a default estimate, conservatively assume + * that no skipping will happen at runtime + */ + if (isdefault) + { + num_sa_scans = num_sa_scans_prev_cols; + break; /* done building indexBoundQuals */ + } + + /* + * Apply indexcol's indexSkipQuals selectivity to ndistinct + */ + if (indexSkipQuals != NIL) + { + List *partialSkipQuals; + Selectivity ndistinctfrac; + + /* + * If the index is partial, AND the index predicate with + * the index-bound quals to produce a more accurate idea + * of the number of distinct values for prior indexcol + */ + partialSkipQuals = add_predicate_to_index_quals(index, + indexSkipQuals); + + ndistinctfrac = clauselist_selectivity(root, partialSkipQuals, + index->rel->relid, + JOIN_INNER, + NULL); + + /* + * If ndistinctfrac is selective (on its own), the scan is + * unlikely to benefit from repositioning itself using + * later quals. Do not allow iclause->indexcol's quals to + * be added to indexBoundQuals (it would increase descent + * costs, without lowering numIndexTuples costs by much). + */ + if (ndistinctfrac < DEFAULT_RANGE_INEQ_SEL) + { + num_sa_scans = num_sa_scans_prev_cols; + break; /* done building indexBoundQuals */ + } + + /* Adjust ndistinct downward */ + ndistinct = rint(ndistinct * ndistinctfrac); + ndistinct = Max(ndistinct, 1); + } + + /* + * When there are no inequality quals, account for the need to + * find an initial value by counting -inf/+inf as a value.
+ * + * We don't charge anything extra for possible next/prior key + * index probes, which are sometimes used to find the next + * valid skip array element (ahead of using the located + * element value to relocate the scan to the next position + * that might contain matching tuples). It seems hard to do + * better here. Use of the skip support infrastructure often + * avoids most next/prior key probes. But even when it can't, + * there's a decent chance that most individual next/prior key + * probes will locate a leaf page whose key space overlaps all + * of the scan's keys (even the lower-order keys) -- which + * also avoids the need for a separate, extra index descent. + * Note also that these probes are much cheaper than non-probe + * primitive index scans: they're reliably very selective. + */ + if (indexSkipQuals == NIL) + ndistinct += 1; + + /* + * Update num_sa_scans estimate by multiplying by ndistinct. + * + * We make the pessimistic assumption that there is no + * naturally occurring cross-column correlation. This is + * often wrong, but it seems best to err on the side of not + * expecting skipping to be helpful... + */ + num_sa_scans *= ndistinct; + + /* + * ...but back out of adding this latest group of 1 or more + * skip arrays when num_sa_scans exceeds the total number of + * index pages (revert to num_sa_scans from before indexcol). + * This causes a sharp discontinuity in cost (as a function of + * the indexcol's ndistinct), but that is representative of + * actual runtime costs. + * + * Note that skipping is helpful when each primitive index + * scan only manages to skip over 1 or 2 irrelevant leaf pages + * on average. Skip arrays bring savings in CPU costs due to + * the scan not needing to evaluate indexquals against every + * tuple, which can greatly exceed any savings in I/O costs. 
+ * This test is a test of whether num_sa_scans implies that + * we're past the point where the ability to skip ceases to + * lower the scan's costs (even qual evaluation CPU costs). + */ + if (index->pages < num_sa_scans) + { + num_sa_scans = num_sa_scans_prev_cols; + break; /* done building indexBoundQuals */ + } + + indexcol++; + indexSkipQuals = NIL; + } + + /* + * Finished considering the need to add skip arrays to bridge an + * initial eqQualHere gap between the old and new index columns + * (or there was no initial eqQualHere gap in the first place). + * + * If an initial gap could not be bridged, then new column's quals + * (i.e. iclause->indexcol's quals) won't go into indexBoundQuals, + * and so won't affect our final numIndexTuples estimate. + */ if (indexcol != iclause->indexcol) - break; /* no quals at all for indexcol */ + break; /* done building indexBoundQuals */ } + Assert(indexcol == iclause->indexcol); + /* Examine each indexqual associated with this index clause */ foreach(lc2, iclause->indexquals) { @@ -7081,6 +7411,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, RowCompareExpr *rc = (RowCompareExpr *) clause; clause_op = linitial_oid(rc->opnos); + found_row_compare = true; } else if (IsA(clause, ScalarArrayOpExpr)) { @@ -7089,7 +7420,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, double alength = estimate_array_length(root, other_operand); clause_op = saop->opno; - found_saop = true; + found_array = true; /* estimate SA descents by indexBoundQuals only */ if (alength > 1) num_sa_scans *= alength; @@ -7101,7 +7432,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, if (nt->nulltesttype == IS_NULL) { found_is_null_op = true; - /* IS NULL is like = for selectivity purposes */ + /* IS NULL is like = for selectivity/skip scan purposes */ eqQualHere = true; } } @@ -7120,19 +7451,28 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, } indexBoundQuals = 
lappend(indexBoundQuals, rinfo); + + /* + * We apply inequality selectivities to estimate index descent + * costs with scans that use skip arrays. Save this indexcol's + * RestrictInfos if it looks like they'll be needed for that. + */ + if (!eqQualHere && !found_row_compare && + indexcol < index->nkeycolumns - 1) + indexSkipQuals = lappend(indexSkipQuals, rinfo); } } /* * If index is unique and we found an '=' clause for each column, we can * just assume numIndexTuples = 1 and skip the expensive - * clauselist_selectivity calculations. However, a ScalarArrayOp or - * NullTest invalidates that theory, even though it sets eqQualHere. + * clauselist_selectivity calculations. However, an array or NullTest + * always invalidates that theory (even when eqQualHere has been set). */ if (index->unique && indexcol == index->nkeycolumns - 1 && eqQualHere && - !found_saop && + !found_array && !found_is_null_op) numIndexTuples = 1.0; else @@ -7154,7 +7494,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, numIndexTuples = btreeSelectivity * index->rel->tuples; /* - * btree automatically combines individual ScalarArrayOpExpr primitive + * btree automatically combines individual array element primitive * index scans whenever the tuples covered by the next set of array * keys are close to tuples covered by the current set. That puts a * natural ceiling on the worst case number of descents -- there @@ -7172,16 +7512,18 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, * of leaf pages (we make it 1/3 the total number of pages instead) to * give the btree code credit for its ability to continue on the leaf * level with low selectivity scans. + * + * Note: num_sa_scans includes both ScalarArrayOp array elements and + * skip array elements whose qual affects our numIndexTuples estimate. 
*/ num_sa_scans = Min(num_sa_scans, ceil(index->pages * 0.3333333)); num_sa_scans = Max(num_sa_scans, 1); /* - * As in genericcostestimate(), we have to adjust for any - * ScalarArrayOpExpr quals included in indexBoundQuals, and then round - * to integer. + * As in genericcostestimate(), we have to adjust for any array quals + * included in indexBoundQuals, and then round to integer. * - * It is tempting to make genericcostestimate behave as if SAOP + * It is tempting to make genericcostestimate behave as if array * clauses work in almost the same way as scalar operators during * btree scans, making the top-level scan look like a continuous scan * (as opposed to num_sa_scans-many primitive index scans). After @@ -7214,7 +7556,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, * comparisons to descend a btree of N leaf tuples. We charge one * cpu_operator_cost per comparison. * - * If there are ScalarArrayOpExprs, charge this once per estimated SA + * If there are SAOP/skip array keys, charge this once per estimated SA * index descent. The ones after the first one are not startup cost so * far as the overall plan goes, so just add them to "total" cost. */ @@ -7234,110 +7576,25 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, * cost is somewhat arbitrarily set at 50x cpu_operator_cost per page * touched. The number of such pages is btree tree height plus one (ie, * we charge for the leaf page too). As above, charge once per estimated - * SA index descent. + * SAOP/skip array descent. */ descentCost = (index->tree_height + 1) * DEFAULT_PAGE_CPU_MULTIPLIER * cpu_operator_cost; costs.indexStartupCost += descentCost; costs.indexTotalCost += costs.num_sa_scans * descentCost; - /* - * If we can get an estimate of the first column's ordering correlation C - * from pg_statistic, estimate the index correlation as C for a - * single-column index, or C * 0.75 for multiple columns. 
(The idea here - * is that multiple columns dilute the importance of the first column's - * ordering, but don't negate it entirely. Before 8.0 we divided the - * correlation by the number of columns, but that seems too strong.) - */ - if (index->indexkeys[0] != 0) + if (!have_correlation) { - /* Simple variable --- look to stats for the underlying table */ - RangeTblEntry *rte = planner_rt_fetch(index->rel->relid, root); - - Assert(rte->rtekind == RTE_RELATION); - relid = rte->relid; - Assert(relid != InvalidOid); - colnum = index->indexkeys[0]; - - if (get_relation_stats_hook && - (*get_relation_stats_hook) (root, rte, colnum, &vardata)) - { - /* - * The hook took control of acquiring a stats tuple. If it did - * supply a tuple, it'd better have supplied a freefunc. - */ - if (HeapTupleIsValid(vardata.statsTuple) && - !vardata.freefunc) - elog(ERROR, "no function provided to release variable stats with"); - } - else - { - vardata.statsTuple = SearchSysCache3(STATRELATTINH, - ObjectIdGetDatum(relid), - Int16GetDatum(colnum), - BoolGetDatum(rte->inh)); - vardata.freefunc = ReleaseSysCache; - } + examine_indexcol_variable(root, index, 0, &vardata); + if (HeapTupleIsValid(vardata.statsTuple)) + costs.indexCorrelation = btcost_correlation(index, &vardata); + ReleaseVariableStats(vardata); } else { - /* Expression --- maybe there are stats for the index itself */ - relid = index->indexoid; - colnum = 1; - - if (get_index_stats_hook && - (*get_index_stats_hook) (root, relid, colnum, &vardata)) - { - /* - * The hook took control of acquiring a stats tuple. If it did - * supply a tuple, it'd better have supplied a freefunc. 
- */ - if (HeapTupleIsValid(vardata.statsTuple) && - !vardata.freefunc) - elog(ERROR, "no function provided to release variable stats with"); - } - else - { - vardata.statsTuple = SearchSysCache3(STATRELATTINH, - ObjectIdGetDatum(relid), - Int16GetDatum(colnum), - BoolGetDatum(false)); - vardata.freefunc = ReleaseSysCache; - } + /* btcost_correlation already called earlier on */ + costs.indexCorrelation = correlation; } - if (HeapTupleIsValid(vardata.statsTuple)) - { - Oid sortop; - AttStatsSlot sslot; - - sortop = get_opfamily_member(index->opfamily[0], - index->opcintype[0], - index->opcintype[0], - BTLessStrategyNumber); - if (OidIsValid(sortop) && - get_attstatsslot(&sslot, vardata.statsTuple, - STATISTIC_KIND_CORRELATION, sortop, - ATTSTATSSLOT_NUMBERS)) - { - double varCorrelation; - - Assert(sslot.nnumbers == 1); - varCorrelation = sslot.numbers[0]; - - if (index->reverse_sort[0]) - varCorrelation = -varCorrelation; - - if (index->nkeycolumns > 1) - costs.indexCorrelation = varCorrelation * 0.75; - else - costs.indexCorrelation = varCorrelation; - - free_attstatsslot(&sslot); - } - } - - ReleaseVariableStats(vardata); - *indexStartupCost = costs.indexStartupCost; *indexTotalCost = costs.indexTotalCost; *indexSelectivity = costs.indexSelectivity; diff --git a/src/backend/utils/adt/skipsupport.c b/src/backend/utils/adt/skipsupport.c new file mode 100644 index 000000000000..2bd35d2d2722 --- /dev/null +++ b/src/backend/utils/adt/skipsupport.c @@ -0,0 +1,61 @@ +/*------------------------------------------------------------------------- + * + * skipsupport.c + * Support routines for B-Tree skip scan. 
+ * + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/adt/skipsupport.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/nbtree.h" +#include "utils/lsyscache.h" +#include "utils/skipsupport.h" + +/* + * Fill in SkipSupport given an operator class (opfamily + opcintype). + * + * On success, returns skip support struct, allocating in caller's memory + * context. Otherwise returns NULL, indicating that operator class has no + * skip support function. + */ +SkipSupport +PrepareSkipSupportFromOpclass(Oid opfamily, Oid opcintype, bool reverse) +{ + Oid skipSupportFunction; + SkipSupport sksup; + + /* Look for a skip support function */ + skipSupportFunction = get_opfamily_proc(opfamily, opcintype, opcintype, + BTSKIPSUPPORT_PROC); + if (!OidIsValid(skipSupportFunction)) + return NULL; + + sksup = palloc(sizeof(SkipSupportData)); + OidFunctionCall1(skipSupportFunction, PointerGetDatum(sksup)); + + if (reverse) + { + /* + * DESC/reverse case: swap low_elem with high_elem, and swap decrement + * with increment + */ + Datum low_elem = sksup->low_elem; + SkipSupportIncDec decrement = sksup->decrement; + + sksup->low_elem = sksup->high_elem; + sksup->decrement = sksup->increment; + + sksup->high_elem = low_elem; + sksup->increment = decrement; + } + + return sksup; +} diff --git a/src/backend/utils/adt/timestamp.c b/src/backend/utils/adt/timestamp.c index 9682f9dbdca1..347089b76264 100644 --- a/src/backend/utils/adt/timestamp.c +++ b/src/backend/utils/adt/timestamp.c @@ -37,6 +37,7 @@ #include "utils/datetime.h" #include "utils/float.h" #include "utils/numeric.h" +#include "utils/skipsupport.h" #include "utils/sortsupport.h" /* @@ -2304,6 +2305,53 @@ timestamp_sortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +/* note: this is used for timestamptz also 
*/ +static Datum +timestamp_decrement(Relation rel, Datum existing, bool *underflow) +{ + Timestamp texisting = DatumGetTimestamp(existing); + + if (texisting == PG_INT64_MIN) + { + /* return value is undefined */ + *underflow = true; + return (Datum) 0; + } + + *underflow = false; + return TimestampGetDatum(texisting - 1); +} + +/* note: this is used for timestamptz also */ +static Datum +timestamp_increment(Relation rel, Datum existing, bool *overflow) +{ + Timestamp texisting = DatumGetTimestamp(existing); + + if (texisting == PG_INT64_MAX) + { + /* return value is undefined */ + *overflow = true; + return (Datum) 0; + } + + *overflow = false; + return TimestampGetDatum(texisting + 1); +} + +Datum +timestamp_skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = timestamp_decrement; + sksup->increment = timestamp_increment; + sksup->low_elem = TimestampGetDatum(PG_INT64_MIN); + sksup->high_elem = TimestampGetDatum(PG_INT64_MAX); + + PG_RETURN_VOID(); +} + Datum timestamp_hash(PG_FUNCTION_ARGS) { diff --git a/src/backend/utils/adt/uuid.c b/src/backend/utils/adt/uuid.c index be0f0f9f1ce4..bce7309c1833 100644 --- a/src/backend/utils/adt/uuid.c +++ b/src/backend/utils/adt/uuid.c @@ -13,6 +13,7 @@ #include "postgres.h" +#include <limits.h> #include <time.h> /* for clock_gettime() */ #include "common/hashfn.h" @@ -21,6 +22,7 @@ #include "port/pg_bswap.h" #include "utils/fmgrprotos.h" #include "utils/guc.h" +#include "utils/skipsupport.h" #include "utils/sortsupport.h" #include "utils/timestamp.h" #include "utils/uuid.h" @@ -418,6 +420,74 @@ uuid_abbrev_convert(Datum original, SortSupport ssup) return res; } +static Datum +uuid_decrement(Relation rel, Datum existing, bool *underflow) +{ + pg_uuid_t *uuid; + + uuid = (pg_uuid_t *) palloc(UUID_LEN); + memcpy(uuid, DatumGetUUIDP(existing), UUID_LEN); + for (int i = UUID_LEN - 1; i >= 0; i--) + { + if (uuid->data[i] > 0) + { + uuid->data[i]--; + *underflow = false; + return
UUIDPGetDatum(uuid); + } + uuid->data[i] = UCHAR_MAX; + } + + pfree(uuid); /* cannot leak memory */ + + /* return value is undefined */ + *underflow = true; + return (Datum) 0; +} + +static Datum +uuid_increment(Relation rel, Datum existing, bool *overflow) +{ + pg_uuid_t *uuid; + + uuid = (pg_uuid_t *) palloc(UUID_LEN); + memcpy(uuid, DatumGetUUIDP(existing), UUID_LEN); + for (int i = UUID_LEN - 1; i >= 0; i--) + { + if (uuid->data[i] < UCHAR_MAX) + { + uuid->data[i]++; + *overflow = false; + return UUIDPGetDatum(uuid); + } + uuid->data[i] = 0; + } + + pfree(uuid); /* cannot leak memory */ + + /* return value is undefined */ + *overflow = true; + return (Datum) 0; +} + +Datum +uuid_skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + pg_uuid_t *uuid_min = palloc(UUID_LEN); + pg_uuid_t *uuid_max = palloc(UUID_LEN); + + memset(uuid_min->data, 0x00, UUID_LEN); + memset(uuid_max->data, 0xFF, UUID_LEN); + + sksup->decrement = uuid_decrement; + sksup->increment = uuid_increment; + sksup->low_elem = UUIDPGetDatum(uuid_min); + sksup->high_elem = UUIDPGetDatum(uuid_max); + + PG_RETURN_VOID(); +} + /* hash index support */ Datum uuid_hash(PG_FUNCTION_ARGS) diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index c4a0737731f2..52916bab7a31 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -214,7 +214,8 @@ typedef void (*amrestrpos_function) (IndexScanDesc scan); */ /* estimate size of parallel scan descriptor */ -typedef Size (*amestimateparallelscan_function) (int nkeys, int norderbys); +typedef Size (*amestimateparallelscan_function) (Relation indexRelation, + int nkeys, int norderbys); /* prepare for parallel index scan */ typedef void (*aminitparallelscan_function) (void *target); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index faabcb78e7b7..b86bf7bf37e0 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -24,6 +24,7 @@ #include 
"lib/stringinfo.h" #include "storage/bufmgr.h" #include "storage/shm_toc.h" +#include "utils/skipsupport.h" /* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */ typedef uint16 BTCycleId; @@ -707,6 +708,10 @@ BTreeTupleGetMaxHeapTID(IndexTuple itup) * (BTOPTIONS_PROC). These procedures define a set of user-visible * parameters that can be used to control operator class behavior. None of * the built-in B-Tree operator classes currently register an "options" proc. + * + * To facilitate more efficient B-Tree skip scans, an operator class may + * choose to offer a sixth amproc procedure (BTSKIPSUPPORT_PROC). For full + * details, see src/include/utils/skipsupport.h. */ #define BTORDER_PROC 1 @@ -714,7 +719,8 @@ BTreeTupleGetMaxHeapTID(IndexTuple itup) #define BTINRANGE_PROC 3 #define BTEQUALIMAGE_PROC 4 #define BTOPTIONS_PROC 5 -#define BTNProcs 5 +#define BTSKIPSUPPORT_PROC 6 +#define BTNProcs 6 /* * We need to be able to tell the difference between read and write @@ -1027,10 +1033,21 @@ typedef BTScanPosData *BTScanPos; /* We need one of these for each equality-type SK_SEARCHARRAY scan key */ typedef struct BTArrayKeyInfo { + /* fields used by both kinds of array (standard arrays and skip arrays) */ int scan_key; /* index of associated key in keyData */ - int cur_elem; /* index of current element in elem_values */ - int num_elems; /* number of elems in current array value */ + int num_elems; /* number of elems (-1 for skip array) */ + + /* fields for ScalarArrayOpExpr arrays */ Datum *elem_values; /* array of num_elems Datums */ + int cur_elem; /* index of current element in elem_values */ + + /* fields for skip arrays, which generate element datums procedurally */ + int16 attlen; /* attr's length, in bytes */ + bool attbyval; /* attr's FormData_pg_attribute.attbyval */ + bool null_elem; /* NULL is lowest/highest element? 
*/ + SkipSupport sksup; /* skip support (NULL if opclass lacks it) */ + ScanKey low_compare; /* array's > or >= lower bound */ + ScanKey high_compare; /* array's < or <= upper bound */ } BTArrayKeyInfo; typedef struct BTScanOpaqueData @@ -1119,6 +1136,15 @@ typedef struct BTReadPageState */ #define SK_BT_REQFWD 0x00010000 /* required to continue forward scan */ #define SK_BT_REQBKWD 0x00020000 /* required to continue backward scan */ +#define SK_BT_SKIP 0x00040000 /* skip array on column without input = */ + +/* SK_BT_SKIP-only flags (set and unset by array advancement) */ +#define SK_BT_MINVAL 0x00080000 /* invalid sk_argument, use low_compare */ +#define SK_BT_MAXVAL 0x00100000 /* invalid sk_argument, use high_compare */ +#define SK_BT_NEXT 0x00200000 /* positions the scan > sk_argument */ +#define SK_BT_PRIOR 0x00400000 /* positions the scan < sk_argument */ + +/* Remaps pg_index flag bits to uppermost SK_BT_* byte */ #define SK_BT_INDOPTION_SHIFT 24 /* must clear the above bits */ #define SK_BT_DESC (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT) #define SK_BT_NULLS_FIRST (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT) @@ -1165,7 +1191,7 @@ extern bool btinsert(Relation rel, Datum *values, bool *isnull, bool indexUnchanged, struct IndexInfo *indexInfo); extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys); -extern Size btestimateparallelscan(int nkeys, int norderbys); +extern Size btestimateparallelscan(Relation rel, int nkeys, int norderbys); extern void btinitparallelscan(void *target); extern bool btgettuple(IndexScanDesc scan, ScanDirection dir); extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); diff --git a/src/include/catalog/pg_amproc.dat b/src/include/catalog/pg_amproc.dat index 41056171059e..925051489984 100644 --- a/src/include/catalog/pg_amproc.dat +++ b/src/include/catalog/pg_amproc.dat @@ -21,6 +21,8 @@ amprocrighttype => 'bit', amprocnum => '4', amproc => 'btequalimage' }, { amprocfamily => 'btree/bool_ops', 
amproclefttype => 'bool', amprocrighttype => 'bool', amprocnum => '1', amproc => 'btboolcmp' }, +{ amprocfamily => 'btree/bool_ops', amproclefttype => 'bool', + amprocrighttype => 'bool', amprocnum => '6', amproc => 'btboolskipsupport' }, { amprocfamily => 'btree/bool_ops', amproclefttype => 'bool', amprocrighttype => 'bool', amprocnum => '4', amproc => 'btequalimage' }, { amprocfamily => 'btree/bpchar_ops', amproclefttype => 'bpchar', @@ -41,12 +43,16 @@ amprocrighttype => 'char', amprocnum => '1', amproc => 'btcharcmp' }, { amprocfamily => 'btree/char_ops', amproclefttype => 'char', amprocrighttype => 'char', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/char_ops', amproclefttype => 'char', + amprocrighttype => 'char', amprocnum => '6', amproc => 'btcharskipsupport' }, { amprocfamily => 'btree/datetime_ops', amproclefttype => 'date', amprocrighttype => 'date', amprocnum => '1', amproc => 'date_cmp' }, { amprocfamily => 'btree/datetime_ops', amproclefttype => 'date', amprocrighttype => 'date', amprocnum => '2', amproc => 'date_sortsupport' }, { amprocfamily => 'btree/datetime_ops', amproclefttype => 'date', amprocrighttype => 'date', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/datetime_ops', amproclefttype => 'date', + amprocrighttype => 'date', amprocnum => '6', amproc => 'date_skipsupport' }, { amprocfamily => 'btree/datetime_ops', amproclefttype => 'date', amprocrighttype => 'timestamp', amprocnum => '1', amproc => 'date_cmp_timestamp' }, @@ -60,6 +66,9 @@ amproc => 'timestamp_sortsupport' }, { amprocfamily => 'btree/datetime_ops', amproclefttype => 'timestamp', amprocrighttype => 'timestamp', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/datetime_ops', amproclefttype => 'timestamp', + amprocrighttype => 'timestamp', amprocnum => '6', + amproc => 'timestamp_skipsupport' }, { amprocfamily => 'btree/datetime_ops', amproclefttype => 'timestamp', amprocrighttype => 'date', amprocnum => 
'1', amproc => 'timestamp_cmp_date' }, { amprocfamily => 'btree/datetime_ops', amproclefttype => 'timestamp', @@ -74,6 +83,9 @@ { amprocfamily => 'btree/datetime_ops', amproclefttype => 'timestamptz', amprocrighttype => 'timestamptz', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/datetime_ops', amproclefttype => 'timestamptz', + amprocrighttype => 'timestamptz', amprocnum => '6', + amproc => 'timestamp_skipsupport' }, { amprocfamily => 'btree/datetime_ops', amproclefttype => 'timestamptz', amprocrighttype => 'date', amprocnum => '1', amproc => 'timestamptz_cmp_date' }, @@ -122,6 +134,8 @@ amprocrighttype => 'int2', amprocnum => '2', amproc => 'btint2sortsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int2', amprocrighttype => 'int2', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int2', + amprocrighttype => 'int2', amprocnum => '6', amproc => 'btint2skipsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int2', amprocrighttype => 'int4', amprocnum => '1', amproc => 'btint24cmp' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int2', @@ -141,6 +155,8 @@ amprocrighttype => 'int4', amprocnum => '2', amproc => 'btint4sortsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int4', amprocrighttype => 'int4', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int4', + amprocrighttype => 'int4', amprocnum => '6', amproc => 'btint4skipsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int4', amprocrighttype => 'int8', amprocnum => '1', amproc => 'btint48cmp' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int4', @@ -160,6 +176,8 @@ amprocrighttype => 'int8', amprocnum => '2', amproc => 'btint8sortsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int8', amprocrighttype => 'int8', amprocnum => '4', amproc => 
'btequalimage' }, +{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int8', + amprocrighttype => 'int8', amprocnum => '6', amproc => 'btint8skipsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int8', amprocrighttype => 'int4', amprocnum => '1', amproc => 'btint84cmp' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int8', @@ -193,6 +211,8 @@ amprocrighttype => 'oid', amprocnum => '2', amproc => 'btoidsortsupport' }, { amprocfamily => 'btree/oid_ops', amproclefttype => 'oid', amprocrighttype => 'oid', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/oid_ops', amproclefttype => 'oid', + amprocrighttype => 'oid', amprocnum => '6', amproc => 'btoidskipsupport' }, { amprocfamily => 'btree/oidvector_ops', amproclefttype => 'oidvector', amprocrighttype => 'oidvector', amprocnum => '1', amproc => 'btoidvectorcmp' }, @@ -261,6 +281,8 @@ amprocrighttype => 'uuid', amprocnum => '2', amproc => 'uuid_sortsupport' }, { amprocfamily => 'btree/uuid_ops', amproclefttype => 'uuid', amprocrighttype => 'uuid', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/uuid_ops', amproclefttype => 'uuid', + amprocrighttype => 'uuid', amprocnum => '6', amproc => 'uuid_skipsupport' }, { amprocfamily => 'btree/record_ops', amproclefttype => 'record', amprocrighttype => 'record', amprocnum => '1', amproc => 'btrecordcmp' }, { amprocfamily => 'btree/record_image_ops', amproclefttype => 'record', diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index a28a15993a2c..5d5be8ba4e16 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -1004,18 +1004,27 @@ { oid => '3129', descr => 'sort support', proname => 'btint2sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btint2sortsupport' }, +{ oid => '9290', descr => 'skip support', + proname => 'btint2skipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 
'btint2skipsupport' }, { oid => '351', descr => 'less-equal-greater', proname => 'btint4cmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'int4 int4', prosrc => 'btint4cmp' }, { oid => '3130', descr => 'sort support', proname => 'btint4sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btint4sortsupport' }, +{ oid => '9291', descr => 'skip support', + proname => 'btint4skipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'btint4skipsupport' }, { oid => '842', descr => 'less-equal-greater', proname => 'btint8cmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'int8 int8', prosrc => 'btint8cmp' }, { oid => '3131', descr => 'sort support', proname => 'btint8sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btint8sortsupport' }, +{ oid => '9292', descr => 'skip support', + proname => 'btint8skipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'btint8skipsupport' }, { oid => '354', descr => 'less-equal-greater', proname => 'btfloat4cmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'float4 float4', prosrc => 'btfloat4cmp' }, @@ -1034,12 +1043,18 @@ { oid => '3134', descr => 'sort support', proname => 'btoidsortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btoidsortsupport' }, +{ oid => '9293', descr => 'skip support', + proname => 'btoidskipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'btoidskipsupport' }, { oid => '404', descr => 'less-equal-greater', proname => 'btoidvectorcmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'oidvector oidvector', prosrc => 'btoidvectorcmp' }, { oid => '358', descr => 'less-equal-greater', proname => 'btcharcmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'char char', prosrc => 'btcharcmp' }, +{ oid => '9294', descr => 'skip support', + proname => 'btcharskipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 
'btcharskipsupport' }, { oid => '359', descr => 'less-equal-greater', proname => 'btnamecmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'name name', prosrc => 'btnamecmp' }, @@ -2300,6 +2315,9 @@ { oid => '3136', descr => 'sort support', proname => 'date_sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'date_sortsupport' }, +{ oid => '9295', descr => 'skip support', + proname => 'date_skipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'date_skipsupport' }, { oid => '4133', descr => 'window RANGE support', proname => 'in_range', prorettype => 'bool', proargtypes => 'date date interval bool bool', @@ -4497,6 +4515,9 @@ { oid => '1693', descr => 'less-equal-greater', proname => 'btboolcmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'bool bool', prosrc => 'btboolcmp' }, +{ oid => '9296', descr => 'skip support', + proname => 'btboolskipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'btboolskipsupport' }, { oid => '1688', descr => 'hash', proname => 'time_hash', prorettype => 'int4', proargtypes => 'time', @@ -6376,6 +6397,9 @@ { oid => '3137', descr => 'sort support', proname => 'timestamp_sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'timestamp_sortsupport' }, +{ oid => '9297', descr => 'skip support', + proname => 'timestamp_skipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'timestamp_skipsupport' }, { oid => '4134', descr => 'window RANGE support', proname => 'in_range', prorettype => 'bool', @@ -9431,6 +9455,9 @@ { oid => '3300', descr => 'sort support', proname => 'uuid_sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'uuid_sortsupport' }, +{ oid => '9298', descr => 'skip support', + proname => 'uuid_skipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'uuid_skipsupport' }, { oid => '2961', descr => 'I/O', proname => 'uuid_recv', prorettype => 'uuid', proargtypes => 
'internal', prosrc => 'uuid_recv' }, diff --git a/src/include/utils/skipsupport.h b/src/include/utils/skipsupport.h new file mode 100644 index 000000000000..bc51847cf617 --- /dev/null +++ b/src/include/utils/skipsupport.h @@ -0,0 +1,98 @@ +/*------------------------------------------------------------------------- + * + * skipsupport.h + * Support routines for B-Tree skip scan. + * + * B-Tree operator classes for discrete types can optionally provide a support + * function for skipping. This is used during skip scans. + * + * A B-tree operator class that implements skip support provides B-tree index + * scans with a way of enumerating and iterating through every possible value + * from the domain of indexable values. This gives scans a way to determine + * the next value in line for a given skip array/scan key/skipped attribute. + * Scans request the next (or previous) value whenever they run out of tuples + * matching the skip array's current element value. The next (or previous) + * value can be used to relocate the scan; it is applied in combination with + * at least one additional lower-order non-skip key, taken from the query. + * + * Skip support is used by discrete type (e.g., integer and date) opclasses. + * Indexes with an attribute whose input opclass is of one of these types tend + * to store adjacent values in adjoining groups of index tuples. Each time a + * skip scan with skip support successfully guesses that the next value in the + * index (for a given skipped column) is indeed the value that skip support + * just incremented its skip array to, it will have saved the scan some work. + * The scan will have avoided an index probe that directly finds the next + * value that appears in the index. (When skip support guesses wrong, then it + * won't have saved any work, but it also won't have added any useless work. 
+ * The failed attempt to locate exactly-matching index tuples acts just like + * an explicit probe would; it'll still find the index's true next value.) + * + * It usually isn't feasible to implement skip support for an opclass whose + * input type is continuous. The B-Tree code falls back on next-key sentinel + * values for any opclass that doesn't provide its own skip support function. + * This isn't really an implementation restriction; there is no benefit to + * providing skip support for an opclass where guessing that the next indexed + * value is the next possible indexable value never (or hardly ever) works out. + * + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/utils/skipsupport.h + * + *------------------------------------------------------------------------- + */ +#ifndef SKIPSUPPORT_H +#define SKIPSUPPORT_H + +#include "utils/relcache.h" + +typedef struct SkipSupportData *SkipSupport; +typedef Datum (*SkipSupportIncDec) (Relation rel, + Datum existing, + bool *overflow); + +/* + * State/callbacks used by skip arrays to procedurally generate elements. + * + * A BTSKIPSUPPORT_PROC function must set each and every field when called + * (there are no optional fields). + */ +typedef struct SkipSupportData +{ + /* + * low_elem and high_elem must be set with the lowest and highest possible + * values from the domain of indexable values (assuming ascending order) + */ + Datum low_elem; /* lowest sorting/leftmost non-NULL value */ + Datum high_elem; /* highest sorting/rightmost non-NULL value */ + + /* + * Decrement/increment functions. + * + * Returns a decremented/incremented copy of caller's existing datum, + * allocated in caller's memory context (for pass-by-reference types). + * It's not okay for these functions to leak any memory. 
+ * + * When the decrement function (or increment function) is called with a + * value that already matches low_elem (or high_elem), function must set + * the *overflow argument. The return value is treated as undefined by + * the B-Tree code; it shouldn't need to be (and won't be) pfree'd. + * + * The B-Tree code's "existing" datum argument is often just a straight + * copy of a value from an index tuple. Operator classes must accept + * every possible representational variation within the underlying type. + * On the other hand, opclasses are _not_ required to preserve information + * that doesn't affect how datums are sorted (e.g., skip support for a + * fixed precision numeric type needn't preserve datum display scale). + * Operator class decrement/increment functions will never be called with + * a NULL "existing" argument, either. + */ + SkipSupportIncDec decrement; + SkipSupportIncDec increment; +} SkipSupportData; + +extern SkipSupport PrepareSkipSupportFromOpclass(Oid opfamily, Oid opcintype, + bool reverse); + +#endif /* SKIPSUPPORT_H */ diff --git a/src/test/regress/expected/alter_generic.out b/src/test/regress/expected/alter_generic.out index 0c274d56a04d..23bf33f10a91 100644 --- a/src/test/regress/expected/alter_generic.out +++ b/src/test/regress/expected/alter_generic.out @@ -362,9 +362,9 @@ ERROR: invalid operator number 0, must be between 1 and 5 ALTER OPERATOR FAMILY alt_opf4 USING btree ADD OPERATOR 1 < ; -- operator without argument types ERROR: operator argument types must be specified in ALTER OPERATOR FAMILY ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 0 btint42cmp(int4, int2); -- invalid options parsing function -ERROR: invalid function number 0, must be between 1 and 5 -ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 6 btint42cmp(int4, int2); -- function number should be between 1 and 5 -ERROR: invalid function number 6, must be between 1 and 5 +ERROR: invalid function number 0, must be between 1 and 6 +ALTER OPERATOR 
FAMILY alt_opf4 USING btree ADD FUNCTION 7 btint42cmp(int4, int2); -- function number should be between 1 and 6 +ERROR: invalid function number 7, must be between 1 and 6 ALTER OPERATOR FAMILY alt_opf4 USING btree ADD STORAGE invalid_storage; -- Ensure STORAGE is not a part of ALTER OPERATOR FAMILY ERROR: STORAGE cannot be specified in ALTER OPERATOR FAMILY DROP OPERATOR FAMILY alt_opf4 USING btree; @@ -505,6 +505,10 @@ ALTER OPERATOR FAMILY alt_opf18 USING btree ADD ALTER OPERATOR FAMILY alt_opf18 USING btree ADD FUNCTION 4 (int4, int2) btequalimage(oid); ERROR: ordering equal image functions must not be cross-type +-- Should fail. Not allowed to have cross-type skip support function. +ALTER OPERATOR FAMILY alt_opf18 USING btree + ADD FUNCTION 6 (int4, int2) btint4skipsupport(internal); +ERROR: btree skip support functions must not be cross-type ALTER OPERATOR FAMILY alt_opf18 USING btree DROP FUNCTION 2 (int4, int4); ERROR: function 2(integer,integer) does not exist in operator family "alt_opf18" DROP OPERATOR FAMILY alt_opf18 USING btree; diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out index 8879554c3f7c..bfb1a286ea4a 100644 --- a/src/test/regress/expected/btree_index.out +++ b/src/test/regress/expected/btree_index.out @@ -581,6 +581,47 @@ alter table btree_tall_tbl alter COLUMN t set storage plain; create index btree_tall_idx on btree_tall_tbl (t, id) with (fillfactor = 10); insert into btree_tall_tbl select g, repeat('x', 250) from generate_series(1, 130) g; +insert into btree_tall_tbl select g, NULL +from generate_series(50, 60) g; +-- +-- Test for skip scan with type that lacks skip support (text) +-- +set enable_seqscan to false; +set enable_bitmapscan to false; +-- Forwards scan +SELECT id FROM btree_tall_tbl WHERE id = 55 ORDER BY t, id; + id +---- + 55 + 55 +(2 rows) + +explain (costs off) +SELECT id FROM btree_tall_tbl WHERE id = 55 ORDER BY t, id; + QUERY PLAN 
+-------------------------------------------------------- + Index Only Scan using btree_tall_idx on btree_tall_tbl + Index Cond: (id = 55) +(2 rows) + +-- Backwards scan +SELECT id FROM btree_tall_tbl WHERE id = 55 ORDER BY t DESC, id DESC; + id +---- + 55 + 55 +(2 rows) + +explain (costs off) +SELECT id FROM btree_tall_tbl WHERE id = 55 ORDER BY t DESC, id DESC; + QUERY PLAN +----------------------------------------------------------------- + Index Only Scan Backward using btree_tall_idx on btree_tall_tbl + Index Cond: (id = 55) +(2 rows) + +reset enable_seqscan; +reset enable_bitmapscan; -- -- Test for multilevel page deletion -- diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index 15be0043ad43..2cfb26699bef 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -1637,7 +1637,9 @@ DROP TABLE syscol_table; -- Tests for IS NULL/IS NOT NULL with b-tree indexes -- CREATE TABLE onek_with_null AS SELECT unique1, unique2 FROM onek; -INSERT INTO onek_with_null (unique1,unique2) VALUES (NULL, -1), (NULL, NULL); +INSERT INTO onek_with_null(unique1, unique2) +VALUES (NULL, -1), (NULL, 2_147_483_647), (NULL, NULL), + (100, NULL), (500, NULL); CREATE UNIQUE INDEX onek_nulltest ON onek_with_null (unique2,unique1); SET enable_seqscan = OFF; SET enable_indexscan = ON; @@ -1645,7 +1647,7 @@ SET enable_bitmapscan = ON; SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL; count ------- - 2 + 3 (1 row) SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IS NULL; @@ -1657,13 +1659,13 @@ SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IS NULL; SELECT count(*) FROM onek_with_null WHERE unique1 IS NOT NULL; count ------- - 1000 + 1002 (1 row) SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IS NOT NULL; count ------- - 1 + 2 (1 row) SELECT count(*) FROM onek_with_null WHERE unique1 IS NOT NULL AND unique1 > 500; @@ 
-1678,12 +1680,18 @@ SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique1 > 500; 0 (1 row) +SELECT unique1, unique2 FROM onek_with_null WHERE unique1 = 500 ORDER BY unique2 DESC, unique1 DESC LIMIT 1; + unique1 | unique2 +---------+--------- + 500 | +(1 row) + DROP INDEX onek_nulltest; CREATE UNIQUE INDEX onek_nulltest ON onek_with_null (unique2 desc,unique1); SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL; count ------- - 2 + 3 (1 row) SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IS NULL; @@ -1695,13 +1703,13 @@ SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IS NULL; SELECT count(*) FROM onek_with_null WHERE unique1 IS NOT NULL; count ------- - 1000 + 1002 (1 row) SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IS NOT NULL; count ------- - 1 + 2 (1 row) SELECT count(*) FROM onek_with_null WHERE unique1 IS NOT NULL AND unique1 > 500; @@ -1722,12 +1730,18 @@ SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IN (-1, 0, 1 (1 row) +SELECT unique1, unique2 FROM onek_with_null WHERE unique1 = 500 ORDER BY unique2 DESC, unique1 DESC LIMIT 1; + unique1 | unique2 +---------+--------- + 500 | +(1 row) + DROP INDEX onek_nulltest; CREATE UNIQUE INDEX onek_nulltest ON onek_with_null (unique2 desc nulls last,unique1); SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL; count ------- - 2 + 3 (1 row) SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IS NULL; @@ -1739,13 +1753,13 @@ SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IS NULL; SELECT count(*) FROM onek_with_null WHERE unique1 IS NOT NULL; count ------- - 1000 + 1002 (1 row) SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IS NOT NULL; count ------- - 1 + 2 (1 row) SELECT count(*) FROM onek_with_null WHERE unique1 IS NOT NULL AND unique1 > 500; @@ -1760,12 +1774,18 @@ SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique1 
> 500; 0 (1 row) +SELECT unique1, unique2 FROM onek_with_null WHERE unique1 = 500 ORDER BY unique2 DESC, unique1 DESC LIMIT 1; + unique1 | unique2 +---------+--------- + 500 | +(1 row) + DROP INDEX onek_nulltest; CREATE UNIQUE INDEX onek_nulltest ON onek_with_null (unique2 nulls first,unique1); SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL; count ------- - 2 + 3 (1 row) SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IS NULL; @@ -1777,13 +1797,13 @@ SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IS NULL; SELECT count(*) FROM onek_with_null WHERE unique1 IS NOT NULL; count ------- - 1000 + 1002 (1 row) SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IS NOT NULL; count ------- - 1 + 2 (1 row) SELECT count(*) FROM onek_with_null WHERE unique1 IS NOT NULL AND unique1 > 500; @@ -1798,6 +1818,12 @@ SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique1 > 500; 0 (1 row) +SELECT unique1, unique2 FROM onek_with_null WHERE unique1 = 500 ORDER BY unique2 DESC, unique1 DESC LIMIT 1; + unique1 | unique2 +---------+--------- + 500 | +(1 row) + DROP INDEX onek_nulltest; -- Check initial-positioning logic too CREATE UNIQUE INDEX onek_nulltest ON onek_with_null (unique2); @@ -1829,20 +1855,24 @@ SELECT unique1, unique2 FROM onek_with_null WHERE unique2 >= 0 (2 rows) SELECT unique1, unique2 FROM onek_with_null - ORDER BY unique2 DESC LIMIT 2; - unique1 | unique2 ----------+--------- - | - 278 | 999 -(2 rows) + ORDER BY unique2 DESC LIMIT 5; + unique1 | unique2 +---------+------------ + 500 | + 100 | + | + | 2147483647 + 278 | 999 +(5 rows) SELECT unique1, unique2 FROM onek_with_null WHERE unique2 >= -1 - ORDER BY unique2 DESC LIMIT 2; - unique1 | unique2 ----------+--------- - 278 | 999 - 0 | 998 -(2 rows) + ORDER BY unique2 DESC LIMIT 3; + unique1 | unique2 +---------+------------ + | 2147483647 + 278 | 999 + 0 | 998 +(3 rows) SELECT unique1, unique2 FROM onek_with_null WHERE unique2 < 999 
ORDER BY unique2 DESC LIMIT 2; @@ -2247,7 +2277,8 @@ SELECT count(*) FROM dupindexcols (1 row) -- --- Check that index scans with =ANY indexquals return rows in index order +-- Check that index scans with SAOP array and/or skip array indexquals +-- return rows in index order -- explain (costs off) SELECT unique1 FROM tenk1 @@ -2269,7 +2300,7 @@ ORDER BY unique1; 42 (3 rows) --- Non-required array scan key on "tenthous": +-- Skip array on "thousand", SAOP array on "tenthous": explain (costs off) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) @@ -2289,7 +2320,7 @@ ORDER BY thousand; 1 | 1001 (2 rows) --- Non-required array scan key on "tenthous", backward scan: +-- Skip array on "thousand", SAOP array on "tenthous", backward scan: explain (costs off) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) @@ -2309,6 +2340,25 @@ ORDER BY thousand DESC, tenthous DESC; 0 | 3000 (2 rows) +explain (costs off) +SELECT thousand, tenthous FROM tenk1 +WHERE thousand > 995 and tenthous in (998, 999) +ORDER BY thousand desc; + QUERY PLAN +-------------------------------------------------------------------------------- + Index Only Scan Backward using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand > 995) AND (tenthous = ANY ('{998,999}'::integer[]))) +(2 rows) + +SELECT thousand, tenthous FROM tenk1 +WHERE thousand > 995 and tenthous in (998, 999) +ORDER BY thousand desc; + thousand | tenthous +----------+---------- + 999 | 999 + 998 | 998 +(2 rows) + -- -- Check elimination of redundant and contradictory index quals -- @@ -2339,6 +2389,45 @@ SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{7, 14, 22}') and unique1 = ANY(' --------- (0 rows) +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 = ANY(NULL); + QUERY PLAN +------------------------------------------------- + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: (unique1 = ANY (NULL::integer[])) +(2 rows) + +SELECT unique1 FROM 
tenk1 WHERE unique1 = ANY(NULL); + unique1 +--------- +(0 rows) + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{NULL,NULL,NULL}'); + QUERY PLAN +--------------------------------------------------------------- + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: (unique1 = ANY ('{NULL,NULL,NULL}'::integer[])) +(2 rows) + +SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{NULL,NULL,NULL}'); + unique1 +--------- +(0 rows) + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IS NULL AND unique1 IS NULL; + QUERY PLAN +--------------------------------------------------------- + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: ((unique1 IS NULL) AND (unique1 IS NULL)) +(2 rows) + +SELECT unique1 FROM tenk1 WHERE unique1 IS NULL AND unique1 IS NULL; + unique1 +--------- +(0 rows) + explain (costs off) SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 1; QUERY PLAN @@ -2462,6 +2551,44 @@ SELECT unique1 FROM tenk1 WHERE (thousand, tenthous) > (NULL, 5); --------- (0 rows) +-- Skip array redundancy (pair of redundant low_compare inequalities) +explain (costs off) +SELECT thousand, tenthous FROM tenk1 +WHERE thousand > -1 and thousand >= 0 AND tenthous = 3000 +ORDER BY thousand; + QUERY PLAN +-------------------------------------------------------------------------------------- + Index Only Scan using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand > '-1'::integer) AND (thousand >= 0) AND (tenthous = 3000)) +(2 rows) + +SELECT thousand, tenthous FROM tenk1 +WHERE thousand > -1 and thousand >= 0 AND tenthous = 3000 +ORDER BY thousand; + thousand | tenthous +----------+---------- + 0 | 3000 +(1 row) + +-- Skip array redundancy (pair of redundant high_compare inequalities) +explain (costs off) +SELECT thousand, tenthous FROM tenk1 +WHERE thousand < 3 and thousand <= 2 AND tenthous = 1001 +ORDER BY thousand; + QUERY PLAN +-------------------------------------------------------------------------- + Index 
Only Scan using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand < 3) AND (thousand <= 2) AND (tenthous = 1001)) +(2 rows) + +SELECT thousand, tenthous FROM tenk1 +WHERE thousand < 3 and thousand <= 2 AND tenthous = 1001 +ORDER BY thousand; + thousand | tenthous +----------+---------- + 1 | 1001 +(1 row) + -- -- Check elimination of constant-NULL subexpressions -- diff --git a/src/test/regress/expected/psql.out b/src/test/regress/expected/psql.out index b1d12585eaed..cf48ae6d0c2e 100644 --- a/src/test/regress/expected/psql.out +++ b/src/test/regress/expected/psql.out @@ -5332,9 +5332,10 @@ Function | in_range(time without time zone,time without time zone,i btree | uuid_ops | uuid | uuid | 1 | uuid_cmp btree | uuid_ops | uuid | uuid | 2 | uuid_sortsupport btree | uuid_ops | uuid | uuid | 4 | btequalimage + btree | uuid_ops | uuid | uuid | 6 | uuid_skipsupport hash | uuid_ops | uuid | uuid | 1 | uuid_hash hash | uuid_ops | uuid | uuid | 2 | uuid_hash_extended -(5 rows) +(6 rows) -- check \dconfig set work_mem = 10240; diff --git a/src/test/regress/sql/alter_generic.sql b/src/test/regress/sql/alter_generic.sql index de58d268d310..5e20dc63337c 100644 --- a/src/test/regress/sql/alter_generic.sql +++ b/src/test/regress/sql/alter_generic.sql @@ -310,7 +310,7 @@ ALTER OPERATOR FAMILY alt_opf4 USING btree ADD OPERATOR 6 < (int4, int2); -- ope ALTER OPERATOR FAMILY alt_opf4 USING btree ADD OPERATOR 0 < (int4, int2); -- operator number should be between 1 and 5 ALTER OPERATOR FAMILY alt_opf4 USING btree ADD OPERATOR 1 < ; -- operator without argument types ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 0 btint42cmp(int4, int2); -- invalid options parsing function -ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 6 btint42cmp(int4, int2); -- function number should be between 1 and 5 +ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 7 btint42cmp(int4, int2); -- function number should be between 1 and 6 ALTER OPERATOR FAMILY alt_opf4 USING btree ADD 
STORAGE invalid_storage; -- Ensure STORAGE is not a part of ALTER OPERATOR FAMILY DROP OPERATOR FAMILY alt_opf4 USING btree; @@ -444,6 +444,9 @@ ALTER OPERATOR FAMILY alt_opf18 USING btree ADD -- Should fail. Not allowed to have cross-type equalimage function. ALTER OPERATOR FAMILY alt_opf18 USING btree ADD FUNCTION 4 (int4, int2) btequalimage(oid); +-- Should fail. Not allowed to have cross-type skip support function. +ALTER OPERATOR FAMILY alt_opf18 USING btree + ADD FUNCTION 6 (int4, int2) btint4skipsupport(internal); ALTER OPERATOR FAMILY alt_opf18 USING btree DROP FUNCTION 2 (int4, int4); DROP OPERATOR FAMILY alt_opf18 USING btree; diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql index 670ad5c6e6a5..68c61dbc7d19 100644 --- a/src/test/regress/sql/btree_index.sql +++ b/src/test/regress/sql/btree_index.sql @@ -327,6 +327,27 @@ alter table btree_tall_tbl alter COLUMN t set storage plain; create index btree_tall_idx on btree_tall_tbl (t, id) with (fillfactor = 10); insert into btree_tall_tbl select g, repeat('x', 250) from generate_series(1, 130) g; +insert into btree_tall_tbl select g, NULL +from generate_series(50, 60) g; + +-- +-- Test for skip scan with type that lacks skip support (text) +-- +set enable_seqscan to false; +set enable_bitmapscan to false; + +-- Forwards scan +SELECT id FROM btree_tall_tbl WHERE id = 55 ORDER BY t, id; +explain (costs off) +SELECT id FROM btree_tall_tbl WHERE id = 55 ORDER BY t, id; + +-- Backwards scan +SELECT id FROM btree_tall_tbl WHERE id = 55 ORDER BY t DESC, id DESC; +explain (costs off) +SELECT id FROM btree_tall_tbl WHERE id = 55 ORDER BY t DESC, id DESC; + +reset enable_seqscan; +reset enable_bitmapscan; -- -- Test for multilevel page deletion diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index 6b3852dddd80..cd90b1c3a8f2 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -650,7 +650,9 @@ 
DROP TABLE syscol_table; -- CREATE TABLE onek_with_null AS SELECT unique1, unique2 FROM onek; -INSERT INTO onek_with_null (unique1,unique2) VALUES (NULL, -1), (NULL, NULL); +INSERT INTO onek_with_null(unique1, unique2) +VALUES (NULL, -1), (NULL, 2_147_483_647), (NULL, NULL), + (100, NULL), (500, NULL); CREATE UNIQUE INDEX onek_nulltest ON onek_with_null (unique2,unique1); SET enable_seqscan = OFF; @@ -663,6 +665,7 @@ SELECT count(*) FROM onek_with_null WHERE unique1 IS NOT NULL; SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IS NOT NULL; SELECT count(*) FROM onek_with_null WHERE unique1 IS NOT NULL AND unique1 > 500; SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique1 > 500; +SELECT unique1, unique2 FROM onek_with_null WHERE unique1 = 500 ORDER BY unique2 DESC, unique1 DESC LIMIT 1; DROP INDEX onek_nulltest; @@ -675,6 +678,7 @@ SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IS NOT NUL SELECT count(*) FROM onek_with_null WHERE unique1 IS NOT NULL AND unique1 > 500; SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique1 > 500; SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IN (-1, 0, 1); +SELECT unique1, unique2 FROM onek_with_null WHERE unique1 = 500 ORDER BY unique2 DESC, unique1 DESC LIMIT 1; DROP INDEX onek_nulltest; @@ -686,6 +690,7 @@ SELECT count(*) FROM onek_with_null WHERE unique1 IS NOT NULL; SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IS NOT NULL; SELECT count(*) FROM onek_with_null WHERE unique1 IS NOT NULL AND unique1 > 500; SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique1 > 500; +SELECT unique1, unique2 FROM onek_with_null WHERE unique1 = 500 ORDER BY unique2 DESC, unique1 DESC LIMIT 1; DROP INDEX onek_nulltest; @@ -697,6 +702,7 @@ SELECT count(*) FROM onek_with_null WHERE unique1 IS NOT NULL; SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IS NOT NULL; SELECT count(*) FROM 
onek_with_null WHERE unique1 IS NOT NULL AND unique1 > 500; SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique1 > 500; +SELECT unique1, unique2 FROM onek_with_null WHERE unique1 = 500 ORDER BY unique2 DESC, unique1 DESC LIMIT 1; DROP INDEX onek_nulltest; @@ -716,9 +722,9 @@ SELECT unique1, unique2 FROM onek_with_null WHERE unique2 >= 0 ORDER BY unique2 LIMIT 2; SELECT unique1, unique2 FROM onek_with_null - ORDER BY unique2 DESC LIMIT 2; + ORDER BY unique2 DESC LIMIT 5; SELECT unique1, unique2 FROM onek_with_null WHERE unique2 >= -1 - ORDER BY unique2 DESC LIMIT 2; + ORDER BY unique2 DESC LIMIT 3; SELECT unique1, unique2 FROM onek_with_null WHERE unique2 < 999 ORDER BY unique2 DESC LIMIT 2; @@ -852,7 +858,8 @@ SELECT count(*) FROM dupindexcols WHERE f1 BETWEEN 'WA' AND 'ZZZ' and id < 1000 and f1 ~<~ 'YX'; -- --- Check that index scans with =ANY indexquals return rows in index order +-- Check that index scans with SAOP array and/or skip array indexquals +-- return rows in index order -- explain (costs off) @@ -864,7 +871,7 @@ SELECT unique1 FROM tenk1 WHERE unique1 IN (1,42,7) ORDER BY unique1; --- Non-required array scan key on "tenthous": +-- Skip array on "thousand", SAOP array on "tenthous": explain (costs off) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) @@ -874,7 +881,7 @@ SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) ORDER BY thousand; --- Non-required array scan key on "tenthous", backward scan: +-- Skip array on "thousand", SAOP array on "tenthous", backward scan: explain (costs off) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) @@ -884,6 +891,15 @@ SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) ORDER BY thousand DESC, tenthous DESC; +explain (costs off) +SELECT thousand, tenthous FROM tenk1 +WHERE thousand > 995 and tenthous in (998, 999) +ORDER BY thousand desc; + +SELECT thousand, 
tenthous FROM tenk1 +WHERE thousand > 995 and tenthous in (998, 999) +ORDER BY thousand desc; + -- -- Check elimination of redundant and contradictory index quals -- @@ -897,6 +913,21 @@ SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{7, 14, 22}') and unique1 = ANY(' SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{7, 14, 22}') and unique1 = ANY('{33, 44}'::bigint[]); +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 = ANY(NULL); + +SELECT unique1 FROM tenk1 WHERE unique1 = ANY(NULL); + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{NULL,NULL,NULL}'); + +SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{NULL,NULL,NULL}'); + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IS NULL AND unique1 IS NULL; + +SELECT unique1 FROM tenk1 WHERE unique1 IS NULL AND unique1 IS NULL; + explain (costs off) SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 1; @@ -942,6 +973,26 @@ SELECT unique1 FROM tenk1 WHERE (thousand, tenthous) > (NULL, 5); SELECT unique1 FROM tenk1 WHERE (thousand, tenthous) > (NULL, 5); +-- Skip array redundancy (pair of redundant low_compare inequalities) +explain (costs off) +SELECT thousand, tenthous FROM tenk1 +WHERE thousand > -1 and thousand >= 0 AND tenthous = 3000 +ORDER BY thousand; + +SELECT thousand, tenthous FROM tenk1 +WHERE thousand > -1 and thousand >= 0 AND tenthous = 3000 +ORDER BY thousand; + +-- Skip array redundancy (pair of redundant high_compare inequalities) +explain (costs off) +SELECT thousand, tenthous FROM tenk1 +WHERE thousand < 3 and thousand <= 2 AND tenthous = 1001 +ORDER BY thousand; + +SELECT thousand, tenthous FROM tenk1 +WHERE thousand < 3 and thousand <= 2 AND tenthous = 1001 +ORDER BY thousand; + -- -- Check elimination of constant-NULL subexpressions -- diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 8f28d8ff28eb..17c824b24c40 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -225,6 
+225,7 @@ BTScanPos BTScanPosData BTScanPosItem BTShared +BTSkipPreproc BTSortArrayContext BTSpool BTStack @@ -2752,6 +2753,8 @@ SimpleStringListCell SingleBoundSortItem Size SkipPages +SkipSupport +SkipSupportData SlabBlock SlabContext SlabSlot From 6113e728e5eaab1247b255caf85ba48f112818db Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sat, 16 Nov 2024 15:58:41 -0500 Subject: [PATCH 2/5] Enhance nbtree tuple scan key optimizations. Postgres 17 commit e0b1ee17 added two closely related nbtree optimizations: the "prechecked" and "firstpage" optimizations. Both optimizations avoided needlessly evaluating keys that are guaranteed to be satisfied by applying page-level context. These optimizations were adapted to work with the nbtree ScalarArrayOp execution patch a few months later, which became commit 5bf748b8. The "prechecked" design had a number of notable weak points. It didn't account for the fact that an = array scan key's sk_argument field might need to advance at the point of the page precheck (it didn't check the precheck tuple against the key's array, only the key's sk_argument, which needlessly made it ineffective in corner cases involving stepping to a page having advanced the scan's arrays using a truncated high key). It was also an "all or nothing" optimization: either it was completely effective (skipping all required-in-scan-direction keys against all attributes) for the whole page, or it didn't work at all. This also implied that it couldn't be used on pages where the scan had to terminate before reaching the end of the page due to an unsatisfied low-order key setting continuescan=false. Replace both optimizations with a new optimization without any of these weak points. This works by giving affected _bt_readpage calls a scankey offset that its _bt_checkkeys calls start at (an offset to the first key that might not be satisfied by every non-pivot tuple from the page). 
The new optimization is activated at the same point as the previous "prechecked" optimization (at the start of a _bt_readpage of any page after the scan's first). The old "prechecked" optimization worked off of the highest non-pivot tuple on the page (or the lowest, when scanning backwards), but the new "startikey" optimization always works off of a pair of non-pivot tuples (the lowest and the highest, taken together). This approach allows the "startikey" optimization to bypass = array key comparisons whenever they're satisfied by _some_ element (not necessarily the current one). This is useful for SAOP array keys (it fixes the issue with truncated high keys), and is needed to get the most out of range skip array keys (we expect to be able to bypass range skip array = keys when a range of values on the page all satisfy the key, even when there are multiple values, provided they all "satisfy some range skip array element"). Although this is independently useful work, the main motivation is to fix regressions in index scans that are nominally eligible to use skip scan, but can never actually benefit from skipping. These are cases where a leading prefix column contains many distinct values, especially when the number of values approaches the total number of index tuples, where skipping can never be profitable. The CPU cost of skip array maintenance is by far the main cost that must be kept under control. Skip scan's approach of adding skip arrays during preprocessing and then fixing (or significantly ameliorating) the resulting regressions seen in unsympathetic cases is enabled by the optimization added by this commit (and by the "look ahead" optimization introduced by commit 5bf748b8). This allows the planner to avoid generating distinct, competing index paths (one path for skip scan, another for an equivalent traditional full index scan). The overall effect is to make scan runtime close to optimal, even when the planner works off an incorrect cardinality estimate. 
Scans will also perform well given a skipped column with data skew: individual groups of pages with many distinct values in respect of a skipped column can be read about as efficiently as before, without having to give up on skipping over other provably-irrelevant leaf pages. Author: Peter Geoghegan Reviewed-By: Heikki Linnakangas Reviewed-By: Masahiro Ikeda Reviewed-By: Matthias van de Meent Discussion: https://postgr.es/m/CAH2-Wz=Y93jf5WjoOsN=xvqpMjRy-bxCE037bVFi-EasrpeUJA@mail.gmail.com Discussion: https://postgr.es/m/CAH2-WznWDK45JfNPNvDxh6RQy-TaCwULaM5u5ALMXbjLBMcugQ@mail.gmail.com --- src/backend/access/nbtree/nbtpreprocesskeys.c | 1 + src/backend/access/nbtree/nbtree.c | 1 + src/backend/access/nbtree/nbtsearch.c | 77 +-- src/backend/access/nbtree/nbtutils.c | 541 ++++++++++++++---- src/include/access/nbtree.h | 11 +- 5 files changed, 478 insertions(+), 153 deletions(-) diff --git a/src/backend/access/nbtree/nbtpreprocesskeys.c b/src/backend/access/nbtree/nbtpreprocesskeys.c index 5c08cda25a7d..339092dfa678 100644 --- a/src/backend/access/nbtree/nbtpreprocesskeys.c +++ b/src/backend/access/nbtree/nbtpreprocesskeys.c @@ -1389,6 +1389,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys) arrayKeyData = (ScanKey) palloc(numArrayKeyData * sizeof(ScanKeyData)); /* Allocate space for per-array data in the workspace context */ + so->skipScan = (numSkipArrayKeys > 0); so->arrayKeys = (BTArrayKeyInfo *) palloc(numArrayKeys * sizeof(BTArrayKeyInfo)); /* Allocate space for ORDER procs used to help _bt_checkkeys */ diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index bdadbf73cd91..325804ae707a 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -349,6 +349,7 @@ btbeginscan(Relation rel, int nkeys, int norderbys) else so->keyData = NULL; + so->skipScan = false; so->needPrimScan = false; so->scanBehind = false; so->oppositeDirCheck = false; diff --git 
a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 1ef2cb2b55ed..e95c396d23b2 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -1648,47 +1648,14 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, pstate.finaltup = NULL; pstate.page = page; pstate.firstpage = firstpage; + pstate.forcenonrequired = false; + pstate.startikey = 0; pstate.offnum = InvalidOffsetNumber; pstate.skip = InvalidOffsetNumber; pstate.continuescan = true; /* default assumption */ - pstate.prechecked = false; - pstate.firstmatch = false; pstate.rechecks = 0; pstate.targetdistance = 0; - /* - * Prechecking the value of the continuescan flag for the last item on the - * page (for backwards scan it will be the first item on a page). If we - * observe it to be true, then it should be true for all other items. This - * allows us to do significant optimizations in the _bt_checkkeys() - * function for all the items on the page. - * - * With the forward scan, we do this check for the last item on the page - * instead of the high key. It's relatively likely that the most - * significant column in the high key will be different from the - * corresponding value from the last item on the page. So checking with - * the last item on the page would give a more precise answer. - * - * We skip this for the first page read by each (primitive) scan, to avoid - * slowing down point queries. They typically don't stand to gain much - * when the optimization can be applied, and are more likely to notice the - * overhead of the precheck. Also avoid it during scans with array keys, - * which might be using skip scan (XXX fixed in next commit). - */ - if (!pstate.firstpage && !arrayKeys && minoff < maxoff) - { - ItemId iid; - IndexTuple itup; - - iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? 
maxoff : minoff); - itup = (IndexTuple) PageGetItem(page, iid); - - /* Call with arrayKeys=false to avoid undesirable side-effects */ - _bt_checkkeys(scan, &pstate, false, itup, indnatts); - pstate.prechecked = pstate.continuescan; - pstate.continuescan = true; /* reset */ - } - if (ScanDirectionIsForward(dir)) { /* SK_SEARCHARRAY forward scans must provide high key up front */ @@ -1716,6 +1683,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, so->scanBehind = so->oppositeDirCheck = false; /* reset */ } + /* + * Consider pstate.startikey optimization once the ongoing primitive + * index scan has already read at least one page + */ + if (!pstate.firstpage && minoff < maxoff) + _bt_set_startikey(scan, &pstate); + /* load items[] in ascending order */ itemIndex = 0; @@ -1752,6 +1726,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, { Assert(!passes_quals && pstate.continuescan); Assert(offnum < pstate.skip); + Assert(!pstate.forcenonrequired); offnum = pstate.skip; pstate.skip = InvalidOffsetNumber; @@ -1761,7 +1736,6 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, if (passes_quals) { /* tuple passes all scan key conditions */ - pstate.firstmatch = true; if (!BTreeTupleIsPosting(itup)) { /* Remember it */ @@ -1816,7 +1790,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, int truncatt; truncatt = BTreeTupleGetNAtts(itup, rel); - pstate.prechecked = false; /* precheck didn't cover HIKEY */ + pstate.forcenonrequired = false; + pstate.startikey = 0; _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt); } @@ -1855,6 +1830,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, so->scanBehind = so->oppositeDirCheck = false; /* reset */ } + /* + * Consider pstate.startikey optimization once the ongoing primitive + * index scan has already read at least one page + */ + if (!pstate.firstpage && minoff < maxoff) + _bt_set_startikey(scan, 
&pstate); + /* load items[] in descending order */ itemIndex = MaxTIDsPerBTreePage; @@ -1894,6 +1876,11 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, Assert(!BTreeTupleIsPivot(itup)); pstate.offnum = offnum; + if (arrayKeys && offnum == minoff && pstate.forcenonrequired) + { + pstate.forcenonrequired = false; + pstate.startikey = 0; + } passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys, itup, indnatts); @@ -1905,6 +1892,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, { Assert(!passes_quals && pstate.continuescan); Assert(offnum > pstate.skip); + Assert(!pstate.forcenonrequired); offnum = pstate.skip; pstate.skip = InvalidOffsetNumber; @@ -1914,7 +1902,6 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, if (passes_quals && tuple_alive) { /* tuple passes all scan key conditions */ - pstate.firstmatch = true; if (!BTreeTupleIsPosting(itup)) { /* Remember it */ @@ -1970,6 +1957,20 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, so->currPos.itemIndex = MaxTIDsPerBTreePage - 1; } + /* + * If _bt_set_startikey told us to temporarily treat the scan's keys as + * nonrequired (possible only during scans with array keys), there must be + * no lasting consequences for the scan's array keys. The scan's arrays + * should now have exactly the same elements as they would have had if the + * nonrequired behavior had never been used. (In general, a scan's arrays + * are expected to track its progress through the index's key space.) + * + * We are required (by _bt_set_startikey) to call _bt_checkkeys against + * pstate.finaltup with pstate.forcenonrequired=false to allow the scan's + * arrays to recover. Assert that that step hasn't been missed. 
+ */ + Assert(!pstate.forcenonrequired); + return (so->currPos.firstItem <= so->currPos.lastItem); } diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 108030a8ee7c..ea5b3b6885f5 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -57,11 +57,11 @@ static bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, IndexTuple finaltup); static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, - bool advancenonrequired, bool prechecked, bool firstmatch, + bool advancenonrequired, bool forcenonrequired, bool *continuescan, int *ikey); static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, - ScanDirection dir, bool *continuescan); + ScanDirection dir, bool forcenonrequired, bool *continuescan); static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, int tupnatts, TupleDesc tupdesc); static int _bt_keep_natts(Relation rel, IndexTuple lastleft, @@ -1422,9 +1422,10 @@ _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir) * postcondition's <= operator with a >=. In other words, just swap the * precondition with the postcondition.) * - * We also deal with "advancing" non-required arrays here. Callers whose - * sktrig scan key is non-required specify sktrig_required=false. These calls - * are the only exception to the general rule about always advancing the + * We also deal with "advancing" non-required arrays here (or arrays that are + * treated as non-required for the duration of a _bt_readpage call). Callers + * whose sktrig scan key is non-required specify sktrig_required=false. These + * calls are the only exception to the general rule about always advancing the * required array keys (the scan may not even have a required array). These * callers should just pass a NULL pstate (since there is never any question * of stopping the scan). 
No call to _bt_tuple_before_array_skeys is required @@ -1464,6 +1465,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, all_satisfied = true; Assert(!so->needPrimScan && !so->scanBehind && !so->oppositeDirCheck); + Assert(_bt_verify_keys_with_arraykeys(scan)); if (sktrig_required) { @@ -1473,17 +1475,6 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, false, 0, NULL)); - /* - * Required scan key wasn't satisfied, so required arrays will have to - * advance. Invalidate page-level state that tracks whether the - * scan's required-in-opposite-direction-only keys are known to be - * satisfied by page's remaining tuples. - */ - pstate->firstmatch = false; - - /* Shouldn't have to invalidate 'prechecked', though */ - Assert(!pstate->prechecked); - /* * Once we return we'll have a new set of required array keys, so * reset state used by "look ahead" optimization @@ -1491,8 +1482,26 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, pstate->rechecks = 0; pstate->targetdistance = 0; } + else if (sktrig < so->numberOfKeys - 1 && + !(so->keyData[so->numberOfKeys - 1].sk_flags & SK_SEARCHARRAY)) + { + int least_sign_ikey = so->numberOfKeys - 1; + bool continuescan; - Assert(_bt_verify_keys_with_arraykeys(scan)); + /* + * Optimization: perform a precheck of the least significant key + * during !sktrig_required calls when it isn't already our sktrig + * (provided the precheck key is not itself an array). + * + * When the precheck works out we'll avoid an expensive binary search + * of sktrig's array (plus any other arrays before least_sign_ikey). 
+ */ + Assert(so->keyData[sktrig].sk_flags & SK_SEARCHARRAY); + if (!_bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, false, + false, &continuescan, + &least_sign_ikey)) + return false; + } for (int ikey = 0; ikey < so->numberOfKeys; ikey++) { @@ -1534,8 +1543,6 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, if (cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) { - Assert(sktrig_required); - required = true; if (cur->sk_attno > tupnatts) @@ -1669,7 +1676,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, } else { - Assert(sktrig_required && required); + Assert(required); /* * This is a required non-array equality strategy scan key, which @@ -1711,7 +1718,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, * be eliminated by _bt_preprocess_keys. It won't matter if some of * our "true" array scan keys (or even all of them) are non-required. */ - if (required && + if (sktrig_required && required && ((ScanDirectionIsForward(dir) && result > 0) || (ScanDirectionIsBackward(dir) && result < 0))) beyond_end_advance = true; @@ -1726,7 +1733,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, * array scan keys are considered interesting.) */ all_satisfied = false; - if (required) + if (sktrig_required && required) all_required_satisfied = false; else { @@ -1786,6 +1793,12 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, * of any required scan key). All that matters is whether caller's tuple * satisfies the new qual, so it's safe to just skip the _bt_check_compare * recheck when we've already determined that it can only return 'false'. + * + * Note: In practice most scan keys are marked required by preprocessing, + * if necessary by generating a preceding skip array. We nevertheless + * often handle array keys marked required as if they were nonrequired. 
+ * This behavior is requested by our _bt_check_compare caller, though only + * when it is passed "forcenonrequired=true" by _bt_checkkeys. */ if ((sktrig_required && all_required_satisfied) || (!sktrig_required && all_satisfied)) @@ -1796,9 +1809,9 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, Assert(all_required_satisfied); /* Recheck _bt_check_compare on behalf of caller */ - if (_bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, - false, false, false, - &continuescan, &nsktrig) && + if (_bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, false, + false, &continuescan, + &nsktrig) && !so->scanBehind) { /* This tuple satisfies the new qual */ @@ -2042,8 +2055,9 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, * read at least one leaf page before the one we're reading now. This * makes primscan scheduling more efficient when scanning subsets of an * index with many distinct attribute values matching many array elements. - * It encourages fewer, larger primitive scans where that makes sense - * (where index descent costs need to be kept under control). + * It encourages fewer, larger primitive scans where that makes sense. + * This will in turn encourage _bt_readpage to apply the pstate.startikey + * optimization more often. * * Note: This heuristic isn't as aggressive as you might think. We're * conservative about allowing a primitive scan to step from the first @@ -2200,17 +2214,14 @@ _bt_verify_keys_with_arraykeys(IndexScanDesc scan) * the page to the right. * * Advances the scan's array keys when necessary for arrayKeys=true callers. - * Caller can avoid all array related side-effects when calling just to do a - * page continuescan precheck -- pass arrayKeys=false for that. Scans without - * any arrays keys must always pass arrayKeys=false. + * Scans without any array keys must always pass arrayKeys=false. * * Also stops and starts primitive index scans for arrayKeys=true callers. 
* Scans with array keys are required to set up page state that helps us with * this. The page's finaltup tuple (the page high key for a forward scan, or * the page's first non-pivot tuple for a backward scan) must be set in - * pstate.finaltup ahead of the first call here for the page (or possibly the - * first call after an initial continuescan-setting page precheck call). Set - * this to NULL for rightmost page (or the leftmost page for backwards scans). + * pstate.finaltup ahead of the first call here for the page. Set this to + * NULL for rightmost page (or the leftmost page for backwards scans). * * scan: index scan descriptor (containing a search-type scankey) * pstate: page level input and output parameters @@ -2225,42 +2236,48 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, TupleDesc tupdesc = RelationGetDescr(scan->indexRelation); BTScanOpaque so = (BTScanOpaque) scan->opaque; ScanDirection dir = so->currPos.dir; - int ikey = 0; + int ikey = pstate->startikey; bool res; Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts); Assert(!so->needPrimScan && !so->scanBehind && !so->oppositeDirCheck); + Assert(arrayKeys || so->numArrayKeys == 0); - res = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, - arrayKeys, pstate->prechecked, pstate->firstmatch, - &pstate->continuescan, &ikey); + res = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, arrayKeys, + pstate->forcenonrequired, &pstate->continuescan, + &ikey); + /* + * If _bt_check_compare relied on the pstate.startikey optimization, call + * again (in assert-enabled builds) to verify it didn't affect our answer. + * + * Note: we can't do this when pstate.forcenonrequired, since any arrays + * before pstate.startikey won't have advanced on this page at all. 
+ */ + Assert(!pstate->forcenonrequired || arrayKeys); #ifdef USE_ASSERT_CHECKING - if (!arrayKeys && so->numArrayKeys) + if (pstate->startikey > 0 && !pstate->forcenonrequired) { - /* - * This is a continuescan precheck call for a scan with array keys. - * - * Assert that the scan isn't in danger of becoming confused. - */ - Assert(!so->scanBehind && !so->oppositeDirCheck); - Assert(!pstate->prechecked && !pstate->firstmatch); - Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, - tupnatts, false, 0, NULL)); - } - if (pstate->prechecked || pstate->firstmatch) - { - bool dcontinuescan; + bool dres, + dcontinuescan; int dikey = 0; + /* Pass arrayKeys=false to avoid array side-effects */ + dres = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, false, + pstate->forcenonrequired, &dcontinuescan, + &dikey); + Assert(res == dres); + Assert(pstate->continuescan == dcontinuescan); + /* - * Call relied on continuescan/firstmatch prechecks -- assert that we - * get the same answer without those optimizations + * Should also get the same ikey result. We need a slightly weaker + * assertion during arrayKeys calls, since they might be using an + * array that couldn't be marked required during preprocessing + * (preprocessing occasionally fails to add a "bridging" skip array, + * due to implementation restrictions around RowCompare keys). */ - Assert(res == _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, - false, false, false, - &dcontinuescan, &dikey)); - Assert(pstate->continuescan == dcontinuescan); + Assert(arrayKeys || ikey == dikey); + Assert(ikey <= dikey); } #endif @@ -2281,6 +2298,7 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, * It's also possible that the scan is still _before_ the _start_ of * tuples matching the current set of array keys. Check for that first. 
*/ + Assert(!pstate->forcenonrequired); if (_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, true, ikey, NULL)) { @@ -2394,8 +2412,9 @@ _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, Assert(so->numArrayKeys); - _bt_check_compare(scan, flipped, finaltup, nfinaltupatts, tupdesc, - false, false, false, &continuescan, &ikey); + _bt_check_compare(scan, flipped, finaltup, nfinaltupatts, tupdesc, false, + false, &continuescan, + &ikey); if (!continuescan && so->keyData[ikey].sk_strategy != BTEqualStrategyNumber) return false; @@ -2403,6 +2422,294 @@ _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, return true; } +/* + * Determines an offset to the first scan key (an so->keyData[]-wise offset) + * that is _not_ guaranteed to be satisfied by every tuple from pstate.page, + * which is set in pstate.startikey for _bt_checkkeys calls for the page. + * This allows caller to save cycles on comparisons of a prefix of keys while + * reading pstate.page. + * + * Also determines if later calls to _bt_checkkeys (for pstate.page) should be + * forced to treat all required scan keys >= pstate.startikey as nonrequired + * (that is, if they're to be treated as if any SK_BT_REQFWD/SK_BT_REQBKWD + * markings that were set by preprocessing were not set at all, for the + * duration of _bt_checkkeys calls prior to the call for pstate.finaltup). + * This is indicated to caller by setting pstate.forcenonrequired. + * + * Call here at the start of reading a leaf page beyond the first one for the + * primitive index scan. We consider all non-pivot tuples, so it doesn't make + * sense to call here when only a subset of those tuples can ever be read. + * This is also a good idea on performance grounds; not calling here when on + * the first page (first for the current primitive scan) avoids wasting cycles + * during selective point queries. 
They typically don't stand to gain as much + * when we can set pstate.startikey, and are likely to notice the overhead of + * calling here. + * + * Caller must reset startikey and forcenonrequired ahead of the _bt_checkkeys + * call for pstate.finaltup iff we set forcenonrequired=true. This will give + * _bt_checkkeys the opportunity to call _bt_advance_array_keys once more, + * with sktrig_required=true, to advance the arrays that were ignored during + * checks of all of the page's prior tuples. Caller doesn't need to do this + * on the rightmost/leftmost page in the index (where pstate.finaltup isn't + * set), since forcenonrequired won't be set here by us in the first place. + */ +void +_bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Relation rel = scan->indexRelation; + TupleDesc tupdesc = RelationGetDescr(rel); + ItemId iid; + IndexTuple firsttup, + lasttup; + int startikey = 0, + arrayidx = 0, + firstchangingattnum; + bool start_past_saop_eq = false; + + Assert(!so->scanBehind); + Assert(pstate->minoff < pstate->maxoff); + Assert(!pstate->firstpage); + Assert(pstate->startikey == 0); + + if (so->numberOfKeys == 0) + return; + + /* minoff is an offset to the lowest non-pivot tuple on the page */ + iid = PageGetItemId(pstate->page, pstate->minoff); + firsttup = (IndexTuple) PageGetItem(pstate->page, iid); + + /* maxoff is an offset to the highest non-pivot tuple on the page */ + iid = PageGetItemId(pstate->page, pstate->maxoff); + lasttup = (IndexTuple) PageGetItem(pstate->page, iid); + + /* Determine the first attribute whose values change on caller's page */ + firstchangingattnum = _bt_keep_natts_fast(rel, firsttup, lasttup); + + for (; startikey < so->numberOfKeys; startikey++) + { + ScanKey key = so->keyData + startikey; + BTArrayKeyInfo *array; + Datum firstdatum, + lastdatum; + bool firstnull, + lastnull; + int32 result; + + /* + * Determine if it's safe to set pstate.startikey to an 
offset to a + * key that comes after this key, by examining this key + */ + if (unlikely(!(key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))) + { + /* Scan key isn't marked required (corner case) */ + Assert(!(key->sk_flags & SK_ROW_HEADER)); + break; /* unsafe */ + } + if (key->sk_flags & SK_ROW_HEADER) + { + /* + * Can't let pstate.startikey get set to an ikey beyond a + * RowCompare inequality + */ + break; /* unsafe */ + } + if (key->sk_strategy != BTEqualStrategyNumber) + { + /* + * Scalar inequality key. + * + * It's definitely safe for _bt_checkkeys to avoid assessing this + * inequality when the page's first and last non-pivot tuples both + * satisfy the inequality (since the same must also be true of all + * the tuples in between these two). + * + * Unlike the "=" case, it doesn't matter if this attribute has + * more than one distinct value (though it _is_ necessary for any + * and all _prior_ attributes to contain no more than one distinct + * value amongst all of the tuples from pstate.page). 
+ */ + if (key->sk_attno > firstchangingattnum) /* >, not >= */ + break; /* unsafe, preceding attr has multiple + * distinct values */ + + firstdatum = index_getattr(firsttup, key->sk_attno, tupdesc, &firstnull); + lastdatum = index_getattr(lasttup, key->sk_attno, tupdesc, &lastnull); + + if (key->sk_flags & SK_ISNULL) + { + /* IS NOT NULL key */ + Assert(key->sk_flags & SK_SEARCHNOTNULL); + + if (firstnull || lastnull) + break; /* unsafe */ + + /* Safe, IS NOT NULL key satisfied by every tuple */ + continue; + } + + /* Test firsttup */ + if (firstnull || + !DatumGetBool(FunctionCall2Coll(&key->sk_func, + key->sk_collation, firstdatum, + key->sk_argument))) + break; /* unsafe */ + + /* Test lasttup */ + if (lastnull || + !DatumGetBool(FunctionCall2Coll(&key->sk_func, + key->sk_collation, lastdatum, + key->sk_argument))) + break; /* unsafe */ + + /* Safe, scalar inequality satisfied by every tuple */ + continue; + } + + /* Some = key (could be a scalar = key, could be an array = key) */ + Assert(key->sk_strategy == BTEqualStrategyNumber); + + if (!(key->sk_flags & SK_SEARCHARRAY)) + { + /* + * Scalar = key (possibly an IS NULL key). + * + * It is unsafe to set pstate.startikey to an ikey beyond this + * key, unless the = key is satisfied by every possible tuple on + * the page (possible only when attribute has just one distinct + * value among all tuples on the page). 
+ */ + if (key->sk_attno >= firstchangingattnum) + break; /* unsafe, multiple distinct attr values */ + + firstdatum = index_getattr(firsttup, key->sk_attno, tupdesc, + &firstnull); + if (key->sk_flags & SK_ISNULL) + { + /* IS NULL key */ + Assert(key->sk_flags & SK_SEARCHNULL); + + if (!firstnull) + break; /* unsafe */ + + /* Safe, IS NULL key satisfied by every tuple */ + continue; + } + if (firstnull || + !DatumGetBool(FunctionCall2Coll(&key->sk_func, + key->sk_collation, firstdatum, + key->sk_argument))) + break; /* unsafe */ + + /* Safe, scalar = key satisfied by every tuple */ + continue; + } + + /* = array key (could be a SAOP array, could be a skip array) */ + array = &so->arrayKeys[arrayidx++]; + Assert(array->scan_key == startikey); + if (array->num_elems != -1) + { + /* + * SAOP array = key. + * + * Handle this like we handle scalar = keys (though binary search + * for a matching element, to avoid relying on key's sk_argument). + */ + if (key->sk_attno >= firstchangingattnum) + break; /* unsafe, multiple distinct attr values */ + + firstdatum = index_getattr(firsttup, key->sk_attno, tupdesc, + &firstnull); + _bt_binsrch_array_skey(&so->orderProcs[startikey], + false, NoMovementScanDirection, + firstdatum, firstnull, array, key, + &result); + if (result != 0) + break; /* unsafe */ + + /* Safe, SAOP = key satisfied by every tuple */ + start_past_saop_eq = true; + continue; + } + + /* + * Skip array = key. + * + * Handle this like we handle scalar inequality keys (but avoid using + * key's sk_argument/advancing array, as in the SAOP array case). 
+ */ + if (array->null_elem) + { + /* + * Safe, non-range skip array "satisfied" by every tuple on page + * (safe even when "key->sk_attno <= firstchangingattnum") + */ + continue; + } + else if (key->sk_attno > firstchangingattnum) /* >, not >= */ + { + break; /* unsafe, preceding attr has multiple + * distinct values */ + } + + firstdatum = index_getattr(firsttup, key->sk_attno, tupdesc, &firstnull); + lastdatum = index_getattr(lasttup, key->sk_attno, tupdesc, &lastnull); + + /* Test firsttup */ + _bt_binsrch_skiparray_skey(false, ForwardScanDirection, + firstdatum, firstnull, array, key, + &result); + if (result != 0) + break; /* unsafe */ + + /* Test lasttup */ + _bt_binsrch_skiparray_skey(false, ForwardScanDirection, + lastdatum, lastnull, array, key, + &result); + if (result != 0) + break; /* unsafe */ + + /* Safe, range skip array satisfied by every tuple */ + } + + /* + * Use of forcenonrequired is typically undesirable, since it'll force + * _bt_readpage caller to read every tuple on the page -- even though, in + * general, it might well be possible to end the scan on an earlier tuple. + * However, caller must use forcenonrequired when start_past_saop_eq=true, + * since the usual required array behavior might fail to roll over to the + * SAOP array. This is no loss, since it can only happen when reading + * pages that must have all their tuples read either way. + * + * We always prefer forcenonrequired=true during scans with skip arrays + * (except on the first page of each primitive index scan), though -- even + * when "startikey == 0". That way, _bt_advance_array_keys's low-order + * key precheck optimization can always be used (unless on the first page + * of the scan). It seems slightly preferable to check more tuples when + * that allows us to do significantly less skip array maintenance. 
+ */ + pstate->forcenonrequired = (start_past_saop_eq || so->skipScan); + pstate->startikey = startikey; + + /* + * _bt_readpage caller is required to call _bt_checkkeys against page's + * finaltup with forcenonrequired=false whenever we initially set + * forcenonrequired=true. That way the scan's arrays will reliably track + * its progress through the index's key space. + * + * We don't expect this when _bt_readpage caller has no finaltup due to + * its page being the rightmost (or the leftmost, during backwards scans). + * When we see that _bt_readpage has no finaltup, back out of everything. + */ + Assert(!pstate->forcenonrequired || so->numArrayKeys); + if (pstate->forcenonrequired && !pstate->finaltup) + { + pstate->forcenonrequired = false; + pstate->startikey = 0; + } +} + /* * Test whether an indextuple satisfies current scan condition. * @@ -2432,23 +2739,33 @@ _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, * by the current array key, or if they're truly unsatisfied (that is, if * they're unsatisfied by every possible array key). * - * Though we advance non-required array keys on our own, that shouldn't have - * any lasting consequences for the scan. By definition, non-required arrays - * have no fixed relationship with the scan's progress. (There are delicate - * considerations for non-required arrays when the arrays need to be advanced - * following our setting continuescan to false, but that doesn't concern us.) - * * Pass advancenonrequired=false to avoid all array related side effects. * This allows _bt_advance_array_keys caller to avoid infinite recursion. + * + * Pass forcenonrequired=true to instruct us to treat all keys as nonrequired. + * This is used to make it safe to temporarily stop properly maintaining the + * scan's required arrays. 
_bt_checkkeys caller (_bt_readpage, actually) + * determines a prefix of keys that must satisfy every possible corresponding + * index attribute value from its page, which is passed to us via *ikey arg + * (this is the first key that might be unsatisfied by tuples on the page). + * Obviously, we won't maintain any array keys from before *ikey, so it's + * quite possible for such arrays to "fall behind" the index's keyspace. + * Caller will need to "catch up" by passing forcenonrequired=false (alongside + * an *ikey=0) once the page's finaltup is reached. + * + * Note: it's safe to pass an *ikey > 0 with forcenonrequired=false, but only + * when caller determines that it won't affect array maintenance. */ static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, - bool advancenonrequired, bool prechecked, bool firstmatch, + bool advancenonrequired, bool forcenonrequired, bool *continuescan, int *ikey) { BTScanOpaque so = (BTScanOpaque) scan->opaque; + Assert(!forcenonrequired || advancenonrequired); + *continuescan = true; /* default assumption */ for (; *ikey < so->numberOfKeys; (*ikey)++) @@ -2461,36 +2778,20 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, /* * Check if the key is required in the current scan direction, in the - * opposite scan direction _only_, or in neither direction + * opposite scan direction _only_, or in neither direction (except + * when we're forced to treat all scan keys as nonrequired) */ - if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) || - ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir))) + if (forcenonrequired) + { + /* treating scan's keys as non-required */ + } + else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) || + ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir))) requiredSameDir = true; else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsBackward(dir)) || ((key->sk_flags & 
SK_BT_REQBKWD) && ScanDirectionIsForward(dir))) requiredOppositeDirOnly = true; - /* - * If the caller told us the *continuescan flag is known to be true - * for the last item on the page, then we know the keys required for - * the current direction scan should be matched. Otherwise, the - * *continuescan flag would be set for the current item and - * subsequently the last item on the page accordingly. - * - * If the key is required for the opposite direction scan, we can skip - * the check if the caller tells us there was already at least one - * matching item on the page. Also, we require the *continuescan flag - * to be true for the last item on the page to know there are no - * NULLs. - * - * Both cases above work except for the row keys, where NULLs could be - * found in the middle of matching values. - */ - if (prechecked && - (requiredSameDir || (requiredOppositeDirOnly && firstmatch)) && - !(key->sk_flags & SK_ROW_HEADER)) - continue; - if (key->sk_attno > tupnatts) { /* @@ -2512,6 +2813,16 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, { Assert(key->sk_flags & SK_SEARCHARRAY); Assert(key->sk_flags & SK_BT_SKIP); + Assert(requiredSameDir || forcenonrequired); + + /* + * Cannot fall back on _bt_tuple_before_array_skeys when we're + * treating the scan's keys as nonrequired, though. Just handle + * this like any other non-required equality-type array key. 
+ */ + if (forcenonrequired) + return _bt_advance_array_keys(scan, NULL, tuple, tupnatts, + tupdesc, *ikey, false); *continuescan = false; return false; @@ -2521,7 +2832,7 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, if (key->sk_flags & SK_ROW_HEADER) { if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir, - continuescan)) + forcenonrequired, continuescan)) continue; return false; } @@ -2554,9 +2865,20 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, */ if (requiredSameDir) *continuescan = false; + else if (unlikely(key->sk_flags & SK_BT_SKIP)) + { + /* + * If we're treating scan keys as nonrequired, and encounter a + * skip array scan key whose current element is NULL, then it + * must be a non-range skip array + */ + Assert(forcenonrequired && *ikey > 0); + return _bt_advance_array_keys(scan, NULL, tuple, tupnatts, + tupdesc, *ikey, false); + } /* - * In any case, this indextuple doesn't match the qual. + * This indextuple doesn't match the qual. */ return false; } @@ -2577,7 +2899,7 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, * (_bt_advance_array_keys also relies on this behavior during * forward scans.) */ - if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + if ((requiredSameDir || requiredOppositeDirOnly) && ScanDirectionIsBackward(dir)) *continuescan = false; } @@ -2595,7 +2917,7 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, * (_bt_advance_array_keys also relies on this behavior during * backward scans.) */ - if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + if ((requiredSameDir || requiredOppositeDirOnly) && ScanDirectionIsForward(dir)) *continuescan = false; } @@ -2606,15 +2928,7 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, return false; } - /* - * Apply the key-checking function, though only if we must. 
- * - * When a key is required in the opposite-of-scan direction _only_, - * then it must already be satisfied if firstmatch=true indicates that - * an earlier tuple from this same page satisfied it earlier on. - */ - if (!(requiredOppositeDirOnly && firstmatch) && - !DatumGetBool(FunctionCall2Coll(&key->sk_func, key->sk_collation, + if (!DatumGetBool(FunctionCall2Coll(&key->sk_func, key->sk_collation, datum, key->sk_argument))) { /* @@ -2664,7 +2978,8 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, */ static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, - TupleDesc tupdesc, ScanDirection dir, bool *continuescan) + TupleDesc tupdesc, ScanDirection dir, + bool forcenonrequired, bool *continuescan) { ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument); int32 cmpresult = 0; @@ -2704,7 +3019,11 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, if (isNull) { - if (subkey->sk_flags & SK_BT_NULLS_FIRST) + if (forcenonrequired) + { + /* treating scan's keys as non-required */ + } + else if (subkey->sk_flags & SK_BT_NULLS_FIRST) { /* * Since NULLs are sorted before non-NULLs, we know we have @@ -2758,8 +3077,12 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, */ Assert(subkey != (ScanKey) DatumGetPointer(skey->sk_argument)); subkey--; - if ((subkey->sk_flags & SK_BT_REQFWD) && - ScanDirectionIsForward(dir)) + if (forcenonrequired) + { + /* treating scan's keys as non-required */ + } + else if ((subkey->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) *continuescan = false; else if ((subkey->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)) @@ -2811,7 +3134,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, break; } - if (!result) + if (!result && !forcenonrequired) { /* * Tuple fails this qual. 
If it's a required qual for the current @@ -2855,6 +3178,8 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, OffsetNumber aheadoffnum; IndexTuple ahead; + Assert(!pstate->forcenonrequired); + /* Avoid looking ahead when comparing the page high key */ if (pstate->offnum < pstate->minoff) return; diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index b86bf7bf37e0..c8708f2fdc23 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1059,6 +1059,7 @@ typedef struct BTScanOpaqueData /* workspace for SK_SEARCHARRAY support */ int numArrayKeys; /* number of equality-type array keys */ + bool skipScan; /* At least one skip array in arrayKeys[]? */ bool needPrimScan; /* New prim scan to continue in current dir? */ bool scanBehind; /* Check scan not still behind on next page? */ bool oppositeDirCheck; /* scanBehind opposite-scan-dir check? */ @@ -1105,6 +1106,8 @@ typedef struct BTReadPageState IndexTuple finaltup; /* Needed by scans with array keys */ Page page; /* Page being read */ bool firstpage; /* page is first for primitive scan? */ + bool forcenonrequired; /* treat all keys as nonrequired? */ + int startikey; /* start comparisons from this scan key */ /* Per-tuple input parameters, set by _bt_readpage for _bt_checkkeys */ OffsetNumber offnum; /* current tuple's page offset number */ @@ -1113,13 +1116,6 @@ typedef struct BTReadPageState OffsetNumber skip; /* Array keys "look ahead" skip offnum */ bool continuescan; /* Terminate ongoing (primitive) index scan? */ - /* - * Input and output parameters, set and unset by both _bt_readpage and - * _bt_checkkeys to manage precheck optimizations - */ - bool prechecked; /* precheck set continuescan to 'true'? */ - bool firstmatch; /* at least one match so far? 
*/ - /* * Private _bt_checkkeys state used to manage "look ahead" optimization * (only used during scans with array keys) @@ -1327,6 +1323,7 @@ extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arra IndexTuple tuple, int tupnatts); extern bool _bt_scanbehind_checkkeys(IndexScanDesc scan, ScanDirection dir, IndexTuple finaltup); +extern void _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate); extern void _bt_killitems(IndexScanDesc scan); extern BTCycleId _bt_vacuum_cycleid(Relation rel); extern BTCycleId _bt_start_vacuum(Relation rel); From a62d29e19fbf2716cb3e03aec56c82eb8452c60f Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Wed, 26 Mar 2025 18:21:27 -0400 Subject: [PATCH 3/5] Improve skip scan primitive scan scheduling. Fixes a few remaining cases where affected skip scans never quite manage to reach the point of being able to apply the "passed first page" heuristic added by commit 9a2e2a28. They only need to manage to get there once to converge on full index scan behavior, but it was still possible for that to never happen, with the wrong workload. 
Author: Peter Geoghegan Discussion: https://postgr.es/m/CAH2-Wz=RVdG3zWytFWBsyW7fWH7zveFvTHed5JKEsuTT0RCO_A@mail.gmail.com --- src/backend/access/nbtree/nbtsearch.c | 16 +++++ src/backend/access/nbtree/nbtutils.c | 90 ++++++++++++++++++--------- src/include/access/nbtree.h | 3 +- 3 files changed, 78 insertions(+), 31 deletions(-) diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index e95c396d23b2..a653b8d2fa52 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -1655,6 +1655,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, pstate.continuescan = true; /* default assumption */ pstate.rechecks = 0; pstate.targetdistance = 0; + pstate.nskipadvances = 0; if (ScanDirectionIsForward(dir)) { @@ -1884,6 +1885,21 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys, itup, indnatts); + if (arrayKeys && so->scanBehind) + { + /* + * Done scanning this page, but not done with the current + * primscan. + * + * Note: Forward scans don't check this explicitly, since they + * prefer to reuse pstate.skip for this instead. 
+ */ + Assert(!passes_quals && pstate.continuescan); + Assert(!pstate.forcenonrequired); + + break; + } + /* * Check if we need to skip ahead to a later tuple (only possible * when the scan uses array keys) diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index ea5b3b6885f5..2d060e185ee6 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -26,6 +26,7 @@ #define LOOK_AHEAD_REQUIRED_RECHECKS 3 #define LOOK_AHEAD_DEFAULT_DISTANCE 5 +#define NSKIPADVANCES_THRESHOLD 3 static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc, Datum tupdatum, bool tupnull, @@ -41,7 +42,8 @@ static void _bt_array_set_low_or_high(Relation rel, ScanKey skey, BTArrayKeyInfo *array, bool low_not_high); static bool _bt_array_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array); static bool _bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array); -static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir); +static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir, + bool *skip_array_set); static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir); static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, TupleDesc tupdesc, int tupnatts, @@ -970,7 +972,8 @@ _bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array) * advanced (every array remains at its final element for scan direction). 
*/ static bool -_bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) +_bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir, + bool *skip_array_set) { Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; @@ -985,6 +988,9 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) BTArrayKeyInfo *array = &so->arrayKeys[i]; ScanKey skey = &so->keyData[array->scan_key]; + if (array->num_elems == -1) + *skip_array_set = true; + if (ScanDirectionIsForward(dir)) { if (_bt_array_increment(rel, skey, array)) @@ -1460,6 +1466,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, ScanDirection dir = so->currPos.dir; int arrayidx = 0; bool beyond_end_advance = false, + skip_array_advanced = false, has_required_opposite_direction_only = false, all_required_satisfied = true, all_satisfied = true; @@ -1756,6 +1763,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, /* Skip array's new element is tupdatum (or MINVAL/MAXVAL) */ _bt_skiparray_set_element(rel, cur, array, result, tupdatum, tupnull); + skip_array_advanced = true; } else if (array->cur_elem != set_elem) { @@ -1772,11 +1780,19 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, * higher-order arrays (might exhaust all the scan's arrays instead, which * ends the top-level scan). */ - if (beyond_end_advance && !_bt_advance_array_keys_increment(scan, dir)) + if (beyond_end_advance && + !_bt_advance_array_keys_increment(scan, dir, &skip_array_advanced)) goto end_toplevel_scan; Assert(_bt_verify_keys_with_arraykeys(scan)); + /* + * Maintain a page-level count of the number of times the scan's array + * keys advanced in a way that affected at least one skip array + */ + if (sktrig_required && skip_array_advanced) + pstate->nskipadvances++; + /* * Does tuple now satisfy our new qual? Recheck with _bt_check_compare. 
* @@ -1946,26 +1962,12 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, * Being pessimistic would also give some scans with non-required arrays a * perverse advantage over similar scans that use required arrays instead. * - * You can think of this as a speculative bet on what the scan is likely - * to find on the next page. It's not much of a gamble, though, since the - * untruncated prefix of attributes must strictly satisfy the new qual. + * This is similar to our scan-level heuristics, below. They also set + * scanBehind to speculatively continue the primscan onto the next page. */ if (so->scanBehind) { - /* - * Truncated high key -- _bt_scanbehind_checkkeys recheck scheduled. - * - * Remember if recheck needs to call _bt_oppodir_checkkeys for next - * page's finaltup (see below comments about "Handle inequalities - * marked required in the opposite scan direction" for why). - */ - so->oppositeDirCheck = has_required_opposite_direction_only; - - /* - * Make sure that any SAOP arrays that were not marked required by - * preprocessing are reset to their first element for this direction - */ - _bt_rewind_nonrequired_arrays(scan, dir); + /* Truncated high key -- _bt_scanbehind_checkkeys recheck scheduled */ } /* @@ -2006,6 +2008,10 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, else if (has_required_opposite_direction_only && pstate->finaltup && unlikely(!_bt_oppodir_checkkeys(scan, dir, pstate->finaltup))) { + /* + * Make sure that any SAOP arrays that were not marked required by + * preprocessing are reset to their first element for this direction + */ _bt_rewind_nonrequired_arrays(scan, dir); goto new_prim_scan; } @@ -2032,11 +2038,21 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, if (so->scanBehind) { - /* Optimization: skip by setting "look ahead" mechanism's offnum */ + /* + * Remember if recheck needs to call _bt_oppodir_checkkeys for next + * page's finaltup (see above comments about 
"Handle inequalities + * marked required in the opposite scan direction" for why). + */ + so->oppositeDirCheck = has_required_opposite_direction_only; + + _bt_rewind_nonrequired_arrays(scan, dir); + + /* + * skip by setting "look ahead" mechanism's offnum for forwards scans + * (backwards scans check scanBehind flag directly instead) + */ if (ScanDirectionIsForward(dir)) pstate->skip = pstate->maxoff + 1; - else - pstate->skip = pstate->minoff - 1; } /* Caller's tuple doesn't match the new qual */ @@ -2059,19 +2075,31 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, * This will in turn encourage _bt_readpage to apply the pstate.startikey * optimization more often. * - * Note: This heuristic isn't as aggressive as you might think. We're + * Also continue the ongoing primitive index scan when it is still on the + * first page if there have been more than NSKIPADVANCES_THRESHOLD calls + * here that each advanced at least one of the scan's skip arrays + * (deliberately ignore advancements that only affected SAOP arrays here). + * A page that cycles through this many skip array elements is quite + * likely to neighbor similar pages, that we'll also need to read. + * + * Note: These heuristics aren't as aggressive as you might think. We're * conservative about allowing a primitive scan to step from the first * leaf page it reads to the page's sibling page (we only allow it on - * first pages whose finaltup strongly suggests that it'll work out). + * first pages whose finaltup strongly suggests that it'll work out, as + * well as first pages that have a large number of skip array advances). * Clearing this first page finaltup hurdle is a strong signal in itself. + * + * Note: The NSKIPADVANCES_THRESHOLD heuristic exists only to avoid + * pathological cases. 
Specifically, cases where a skip scan should just + * behave like a traditional full index scan, but ends up "skipping" again + * and again, descending to the prior leaf page's direct sibling leaf page + * each time. This misbehavior would otherwise be possible during scans + * that never quite manage to "clear the first page finaltup hurdle". */ - if (!pstate->firstpage) + if (!pstate->firstpage || pstate->nskipadvances > NSKIPADVANCES_THRESHOLD) { /* Schedule a recheck once on the next (or previous) page */ so->scanBehind = true; - so->oppositeDirCheck = has_required_opposite_direction_only; - - _bt_rewind_nonrequired_arrays(scan, dir); /* Continue the current primitive scan after all */ goto continue_scan; @@ -2443,7 +2471,9 @@ _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, * the first page (first for the current primitive scan) avoids wasting cycles * during selective point queries. They typically don't stand to gain as much * when we can set pstate.startikey, and are likely to notice the overhead of - * calling here. + * calling here. (Also, allowing pstate.forcenonrequired to be set on a + * primscan's first page would mislead _bt_advance_array_keys, which expects + * pstate.nskipadvances to be representative of any first page's key space.) * * Caller must reset startikey and forcenonrequired ahead of the _bt_checkkeys * call for pstate.finaltup iff we set forcenonrequired=true. 
This will give diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index c8708f2fdc23..02da569958e2 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1118,10 +1118,11 @@ typedef struct BTReadPageState /* * Private _bt_checkkeys state used to manage "look ahead" optimization - * (only used during scans with array keys) + * and primscan scheduling (only used during scans with array keys) */ int16 rechecks; int16 targetdistance; + int16 nskipadvances; } BTReadPageState; From a0a9db424bb77e156822cc337080ffa0d2fd1b34 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Mon, 13 Jan 2025 16:08:32 -0500 Subject: [PATCH 4/5] Apply low-order skip key in _bt_first more often. Convert low_compare and high_compare nbtree skip array inequalities (with opclasses that offer skip support) such that _bt_first is consistently able to use later keys when descending the tree within _bt_first. For example, an index qual "WHERE a > 5 AND b = 2" is now converted to "WHERE a >= 6 AND b = 2" by a new preprocessing step that takes place after a final low_compare and/or high_compare are chosen by all earlier preprocessing steps. That way the scan's initial call to _bt_first will use "WHERE a >= 6 AND b = 2" to find the initial leaf level position, rather than merely using "WHERE a > 5" -- "b = 2" can always be applied. This has a decent chance of making the scan avoid an extra _bt_first call that would otherwise be needed just to determine the lowest-sorting "a" value in the index (the lowest that still satisfies "WHERE a > 5"). The transformation process can only lower the total number of index pages read when the use of a more restrictive set of initial positioning keys in _bt_first actually allows the scan to land on some later leaf page directly, relative to the unoptimized case (or on an earlier leaf page directly, when scanning backwards). The savings can be far greater when affected skip arrays come after some higher-order array. 
For example, a qual "WHERE x IN (1, 2, 3) AND y > 5 AND z = 2" can now save as many as 3 _bt_first calls as a result of these transformations (there can be as many as 1 _bt_first call saved per "x" array element). Author: Peter Geoghegan Reviewed-By: Matthias van de Meent Discussion: https://postgr.es/m/CAH2-Wz=FJ78K3WsF3iWNxWnUCY9f=Jdg3QPxaXE=uYUbmuRz5Q@mail.gmail.com --- src/backend/access/nbtree/nbtpreprocesskeys.c | 180 ++++++++++++++++++ src/test/regress/expected/create_index.out | 21 ++ src/test/regress/sql/create_index.sql | 10 + 3 files changed, 211 insertions(+) diff --git a/src/backend/access/nbtree/nbtpreprocesskeys.c b/src/backend/access/nbtree/nbtpreprocesskeys.c index 339092dfa678..0947eb0a667e 100644 --- a/src/backend/access/nbtree/nbtpreprocesskeys.c +++ b/src/backend/access/nbtree/nbtpreprocesskeys.c @@ -50,6 +50,12 @@ static bool _bt_saoparray_shrink(IndexScanDesc scan, ScanKey arraysk, BTArrayKeyInfo *array, bool *qual_ok); static bool _bt_skiparray_shrink(IndexScanDesc scan, ScanKey skey, BTArrayKeyInfo *array, bool *qual_ok); +static void _bt_skiparray_strat_adjust(IndexScanDesc scan, ScanKey arraysk, + BTArrayKeyInfo *array); +static void _bt_skiparray_strat_decrement(IndexScanDesc scan, ScanKey arraysk, + BTArrayKeyInfo *array); +static void _bt_skiparray_strat_increment(IndexScanDesc scan, ScanKey arraysk, + BTArrayKeyInfo *array); static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys); static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap); static int _bt_num_array_keys(IndexScanDesc scan, Oid *skip_eq_ops, @@ -1295,6 +1301,171 @@ _bt_skiparray_shrink(IndexScanDesc scan, ScanKey skey, BTArrayKeyInfo *array, return true; } +/* + * Applies the opfamily's skip support routine to convert the skip array's > + * low_compare key (if any) into a >= key, and to convert its < high_compare + * key (if any) into a <= key. 
Decrements the high_compare key's sk_argument, + * and/or increments the low_compare key's sk_argument (also adjusts their + * operator strategies, while changing the operator as appropriate). + * + * This optional optimization reduces the number of descents required within + * _bt_first. Whenever _bt_first is called with a skip array whose current + * array element is the sentinel value MINVAL, using a transformed >= key + * instead of using the original > key makes it safe to include lower-order + * scan keys in the insertion scan key (there must be lower-order scan keys + * after the skip array). We will avoid an extra _bt_first to find the first + * value in the index > sk_argument -- at least when the first real matching + * value in the index happens to be an exact match for the sk_argument value + * that we produced here by incrementing the original input key's sk_argument. + * (Backwards scans derive the same benefit when they encounter the sentinel + * value MAXVAL, by converting the high_compare key from < to <=.) + * + * Note: The transformation is only correct when it cannot allow the scan to + * overlook matching tuples, but we don't have enough semantic information to + * safely make sure that can't happen during scans with cross-type operators. + * That's why we'll never apply the transformation in cross-type scenarios. + * For example, if we attempted to convert "sales_ts > '2024-01-01'::date" + * into "sales_ts >= '2024-01-02'::date" given a "sales_ts" attribute whose + * input opclass is timestamp_ops, the scan would overlook _all_ tuples for + * sales that fell on '2024-01-01'. + * + * Note: We can safely modify array->low_compare/array->high_compare in place + * because they just point to copies of our scan->keyData[] input scan keys + * (namely the copies returned by _bt_preprocess_array_keys to be used as + * input into the standard preprocessing steps in _bt_preprocess_keys). + * Everything will be reset if there's a rescan. 
+ */ +static void +_bt_skiparray_strat_adjust(IndexScanDesc scan, ScanKey arraysk, + BTArrayKeyInfo *array) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + MemoryContext oldContext; + + /* + * Called last among all preprocessing steps, when the skip array's final + * low_compare and high_compare have both been chosen + */ + Assert(arraysk->sk_flags & SK_BT_SKIP); + Assert(array->num_elems == -1 && !array->null_elem && array->sksup); + + oldContext = MemoryContextSwitchTo(so->arrayContext); + + if (array->high_compare && + array->high_compare->sk_strategy == BTLessStrategyNumber) + _bt_skiparray_strat_decrement(scan, arraysk, array); + + if (array->low_compare && + array->low_compare->sk_strategy == BTGreaterStrategyNumber) + _bt_skiparray_strat_increment(scan, arraysk, array); + + MemoryContextSwitchTo(oldContext); +} + +/* + * Convert skip array's < high_compare key into a <= key + */ +static void +_bt_skiparray_strat_decrement(IndexScanDesc scan, ScanKey arraysk, + BTArrayKeyInfo *array) +{ + Relation rel = scan->indexRelation; + Oid opfamily = rel->rd_opfamily[arraysk->sk_attno - 1], + opcintype = rel->rd_opcintype[arraysk->sk_attno - 1], + leop; + RegProcedure cmp_proc; + ScanKey high_compare = array->high_compare; + Datum orig_sk_argument = high_compare->sk_argument, + new_sk_argument; + bool uflow; + + Assert(high_compare->sk_strategy == BTLessStrategyNumber); + + /* + * Only perform the transformation when the operator type matches the + * index attribute's input opclass type + */ + if (high_compare->sk_subtype != opcintype && + high_compare->sk_subtype != InvalidOid) + return; + + /* Decrement, handling underflow by marking the qual unsatisfiable */ + new_sk_argument = array->sksup->decrement(rel, orig_sk_argument, &uflow); + if (uflow) + { + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + so->qual_ok = false; + return; + } + + /* Look up <= operator (might fail) */ + leop = get_opfamily_member(opfamily, opcintype, opcintype, +
BTLessEqualStrategyNumber); + if (!OidIsValid(leop)) + return; + cmp_proc = get_opcode(leop); + if (RegProcedureIsValid(cmp_proc)) + { + /* Transform < high_compare key into <= key */ + fmgr_info(cmp_proc, &high_compare->sk_func); + high_compare->sk_argument = new_sk_argument; + high_compare->sk_strategy = BTLessEqualStrategyNumber; + } +} + +/* + * Convert skip array's > low_compare key into a >= key + */ +static void +_bt_skiparray_strat_increment(IndexScanDesc scan, ScanKey arraysk, + BTArrayKeyInfo *array) +{ + Relation rel = scan->indexRelation; + Oid opfamily = rel->rd_opfamily[arraysk->sk_attno - 1], + opcintype = rel->rd_opcintype[arraysk->sk_attno - 1], + geop; + RegProcedure cmp_proc; + ScanKey low_compare = array->low_compare; + Datum orig_sk_argument = low_compare->sk_argument, + new_sk_argument; + bool oflow; + + Assert(low_compare->sk_strategy == BTGreaterStrategyNumber); + + /* + * Only perform the transformation when the operator type matches the + * index attribute's input opclass type + */ + if (low_compare->sk_subtype != opcintype && + low_compare->sk_subtype != InvalidOid) + return; + + /* Increment, handling overflow by marking the qual unsatisfiable */ + new_sk_argument = array->sksup->increment(rel, orig_sk_argument, &oflow); + if (oflow) + { + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + so->qual_ok = false; + return; + } + + /* Look up >= operator (might fail) */ + geop = get_opfamily_member(opfamily, opcintype, opcintype, + BTGreaterEqualStrategyNumber); + if (!OidIsValid(geop)) + return; + cmp_proc = get_opcode(geop); + if (RegProcedureIsValid(cmp_proc)) + { + /* Transform > low_compare key into >= key */ + fmgr_info(cmp_proc, &low_compare->sk_func); + low_compare->sk_argument = new_sk_argument; + low_compare->sk_strategy = BTGreaterEqualStrategyNumber; + } +} + /* * _bt_preprocess_array_keys() -- Preprocess SK_SEARCHARRAY scan keys * @@ -1838,6 +2009,15 @@ _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap) } else
{ + /* + * Any skip array low_compare and high_compare scan keys + * are now final. Transform the array's > low_compare key + * into a >= key (and < high_compare keys into a <= key). + */ + if (array->num_elems == -1 && array->sksup && + !array->null_elem) + _bt_skiparray_strat_adjust(scan, outkey, array); + /* Match found, so done with this array */ arrayidx++; } diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index 2cfb26699bef..9ade7b835e69 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -2589,6 +2589,27 @@ ORDER BY thousand; 1 | 1001 (1 row) +-- Skip array preprocessing increments "thousand > -1" to "thousand >= 0" +explain (costs off) +SELECT thousand, tenthous FROM tenk1 +WHERE thousand > -1 AND tenthous IN (1001,3000) +ORDER BY thousand limit 2; + QUERY PLAN +-------------------------------------------------------------------------------------------------- + Limit + -> Index Only Scan using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand > '-1'::integer) AND (tenthous = ANY ('{1001,3000}'::integer[]))) +(3 rows) + +SELECT thousand, tenthous FROM tenk1 +WHERE thousand > -1 AND tenthous IN (1001,3000) +ORDER BY thousand limit 2; + thousand | tenthous +----------+---------- + 0 | 3000 + 1 | 1001 +(2 rows) + -- -- Check elimination of constant-NULL subexpressions -- diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index cd90b1c3a8f2..e21ff426519b 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -993,6 +993,16 @@ SELECT thousand, tenthous FROM tenk1 WHERE thousand < 3 and thousand <= 2 AND tenthous = 1001 ORDER BY thousand; +-- Skip array preprocessing increments "thousand > -1" to "thousand >= 0" +explain (costs off) +SELECT thousand, tenthous FROM tenk1 +WHERE thousand > -1 AND tenthous IN (1001,3000) +ORDER BY thousand limit 2; + +SELECT thousand, 
tenthous FROM tenk1 +WHERE thousand > -1 AND tenthous IN (1001,3000) +ORDER BY thousand limit 2; + -- -- Check elimination of constant-NULL subexpressions -- From 055e60df31911a6ec5b0ea0282bc7c9711446a2d Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sat, 18 Jan 2025 10:54:44 -0500 Subject: [PATCH 5/5] DEBUG: Add skip scan disable GUCs. --- src/backend/access/nbtree/nbtpreprocesskeys.c | 37 +++++++++++++++++++ src/backend/access/nbtree/nbtutils.c | 3 ++ src/backend/utils/misc/guc_tables.c | 34 +++++++++++++++++ src/include/access/nbtree.h | 5 +++ 4 files changed, 79 insertions(+) diff --git a/src/backend/access/nbtree/nbtpreprocesskeys.c b/src/backend/access/nbtree/nbtpreprocesskeys.c index 0947eb0a667e..6501ef6209e8 100644 --- a/src/backend/access/nbtree/nbtpreprocesskeys.c +++ b/src/backend/access/nbtree/nbtpreprocesskeys.c @@ -21,6 +21,33 @@ #include "utils/lsyscache.h" #include "utils/memutils.h" +/* + * GUC parameters (temporary convenience for reviewers). + * + * To disable all skipping, set skipscan_prefix_cols=0. Otherwise set it to + * the attribute number that you wish to make the last attribute number that + * we can add a skip scan key for. For example, skipscan_prefix_cols=1 makes + * an index scan with qual "WHERE b = 1 AND d = 42" generate a skip scan key + * on the column 'a' (which is attnum 1) only, preventing us from adding one + * for the column 'c'. And so only the scan key on 'b' (not the one on 'd') + * gets marked required within _bt_preprocess_keys -- there is no 'c' skip + * array to "anchor the required-ness" of 'b' through 'c' into 'd'. + */ +int skipscan_prefix_cols = INDEX_MAX_KEYS; + +/* + * skipscan_skipsupport_enabled can be used to avoid using skip support. Used + * to quantify the performance benefit that comes from having dedicated skip + * support, with a given opclass and test query. 
+ */ +bool skipscan_skipsupport_enabled = true; + +/* + * skipscan_iprefix_enabled can be used to disable optimizations used when the + * maintenance overhead of skip arrays stops paying for itself + */ +bool skipscan_iprefix_enabled = true; + typedef struct BTScanKeyPreproc { ScanKey inkey; @@ -1650,6 +1677,10 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys) so->arrayKeys[numArrayKeys].low_compare = NULL; /* for now */ so->arrayKeys[numArrayKeys].high_compare = NULL; /* for now */ + /* Temporary testing GUC can disable the use of skip support */ + if (!skipscan_skipsupport_enabled) + so->arrayKeys[numArrayKeys].sksup = NULL; + /* * We'll need a 3-way ORDER proc. Set that up now. */ @@ -2175,6 +2206,12 @@ _bt_num_array_keys(IndexScanDesc scan, Oid *skip_eq_ops, int *numSkipArrayKeys) if (attno_has_rowcompare) break; + /* + * Apply temporary testing GUC that can be used to disable skipping + */ + if (attno_inkey > skipscan_prefix_cols) + break; + /* * Now consider next attno_inkey (or keep going if this is an * additional scan key against the same attribute) diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 2d060e185ee6..38f79983fd04 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -2505,6 +2505,9 @@ _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate) if (so->numberOfKeys == 0) return; + if (!skipscan_iprefix_enabled) + return; + /* minoff is an offset to the lowest non-pivot tuple on the page */ iid = PageGetItemId(pstate->page, pstate->minoff); firsttup = (IndexTuple) PageGetItem(pstate->page, iid); diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 4eaeca89f2c7..eb46c24c444e 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -28,6 +28,7 @@ #include "access/commit_ts.h" #include "access/gin.h" +#include "access/nbtree.h" #include "access/slru.h" #include 
"access/toast_compression.h" #include "access/twophase.h" @@ -1788,6 +1789,28 @@ struct config_bool ConfigureNamesBool[] = }, #endif + /* XXX Remove before commit */ + { + {"skipscan_skipsupport_enabled", PGC_SUSET, DEVELOPER_OPTIONS, + NULL, NULL, + GUC_NOT_IN_SAMPLE + }, + &skipscan_skipsupport_enabled, + true, + NULL, NULL, NULL + }, + + /* XXX Remove before commit */ + { + {"skipscan_iprefix_enabled", PGC_SUSET, DEVELOPER_OPTIONS, + NULL, NULL, + GUC_NOT_IN_SAMPLE + }, + &skipscan_iprefix_enabled, + true, + NULL, NULL, NULL + }, + { {"integer_datetimes", PGC_INTERNAL, PRESET_OPTIONS, gettext_noop("Shows whether datetimes are integer based."), @@ -3724,6 +3747,17 @@ struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + /* XXX Remove before commit */ + { + {"skipscan_prefix_cols", PGC_SUSET, DEVELOPER_OPTIONS, + NULL, NULL, + GUC_NOT_IN_SAMPLE + }, + &skipscan_prefix_cols, + INDEX_MAX_KEYS, 0, INDEX_MAX_KEYS, + NULL, NULL, NULL + }, + { /* Can't be set in postgresql.conf */ {"server_version_num", PGC_INTERNAL, PRESET_OPTIONS, diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 02da569958e2..ca4b6ead49be 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1178,6 +1178,11 @@ typedef struct BTOptions #define PROGRESS_BTREE_PHASE_PERFORMSORT_2 4 #define PROGRESS_BTREE_PHASE_LEAF_LOAD 5 +/* GUC parameters (just a temporary convenience for reviewers) */ +extern PGDLLIMPORT int skipscan_prefix_cols; +extern PGDLLIMPORT bool skipscan_skipsupport_enabled; +extern PGDLLIMPORT bool skipscan_iprefix_enabled; + /* * external entry points for btree, in nbtree.c */