Skip to content

Commit 7c8bd58

Browse files
amitlanCommitfest Bot
authored andcommitted
Explicitly pass snapshot necessary for omit_detached logic
This commit changes find_inheritance_children_extended() and RelationBuildPartitionDesc() to accept the snapshot necessary to implement the omit_detach logic correctly. Previously, these functions used ActiveSnapshot to check if a detach-pending partition's pg_inherits row was visible. This logic aimed to make RI queries over partitioned PK tables under REPEATABLE READ isolation handle detach-pending partitions correctly. However, forcing a snapshot onto ActiveSnapshot led to isolation violations by making scans in the query see changes not consistent with the parent transaction's snapshot. A test added in commit 00cb86e demonstrates this issue. The new interface of RelationBuildPartitionDesc() and relevant callers allows passing the necessary snapshot explcitly, thus avoiding modifications to ActiveSnapshot. Default behavior remains unchanged when no snapshot is provided, maintaining compatibility with non-RI queries and other uses of find_inheritance_children_extended(). A future commit will update RI PK lookups to use this interface. Robert Haas contributed the changes to PartitionDesc interface. Co-author: Robert Haas Reviewed-by: Robert Haas Discussion: https://postgr.es/m/CA+HiwqGkfJfYdeq5vHPh6eqPKjSbfpDDY+j-kXYFePQedtSLeg@mail.gmail.com Discussion: https://postgr.es/m/CA+HiwqG5e8pk8s7+7zhr1Nc_PGyhEdM5f=pHkMOdK1RYWXfJsg@mail.gmail.com
1 parent d611f8b commit 7c8bd58

File tree

6 files changed

+109
-60
lines changed

6 files changed

+109
-60
lines changed

src/backend/catalog/pg_inherits.c

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -51,14 +51,18 @@ typedef struct SeenRelsEntry
5151
* then no locks are acquired, but caller must beware of race conditions
5252
* against possible DROPs of child relations.
5353
*
54-
* Partitions marked as being detached are omitted; see
54+
* A partition marked as being detached is omitted from the result if the
55+
* pg_inherits row showing the partition as being detached is visible to
56+
* ActiveSnapshot, doing so only when one has been pushed; see
5557
* find_inheritance_children_extended for details.
5658
*/
5759
List *
5860
find_inheritance_children(Oid parentrelId, LOCKMODE lockmode)
5961
{
60-
return find_inheritance_children_extended(parentrelId, true, lockmode,
61-
NULL, NULL);
62+
return find_inheritance_children_extended(parentrelId,
63+
ActiveSnapshotSet() ?
64+
GetActiveSnapshot() : NULL,
65+
lockmode, NULL, NULL);
6266
}
6367

6468
/*
@@ -70,16 +74,17 @@ find_inheritance_children(Oid parentrelId, LOCKMODE lockmode)
7074
* If a partition's pg_inherits row is marked "detach pending",
7175
* *detached_exist (if not null) is set true.
7276
*
73-
* If omit_detached is true and there is an active snapshot (not the same as
74-
* the catalog snapshot used to scan pg_inherits!) and a pg_inherits tuple
75-
* marked "detach pending" is visible to that snapshot, then that partition is
76-
* omitted from the output list. This makes partitions invisible depending on
77-
* whether the transaction that marked those partitions as detached appears
78-
* committed to the active snapshot. In addition, *detached_xmin (if not null)
79-
* is set to the xmin of the row of the detached partition.
77+
* If the caller passed 'omit_detached_snapshot', the partition whose
78+
* pg_inherits tuple marks it as "detach pending" is omitted from the output
79+
* list if the tuple is visible to that snapshot. That is, such a partition
80+
* is omitted from the output list depending on whether the transaction that
81+
* marked that partition as detached appears committed to
82+
* omit_detached_snapshot. If omitted, *detached_xmin (if non NULL) is set
83+
* to the xmin of that pg_inherits tuple.
8084
*/
8185
List *
82-
find_inheritance_children_extended(Oid parentrelId, bool omit_detached,
86+
find_inheritance_children_extended(Oid parentrelId,
87+
Snapshot omit_detached_snapshot,
8388
LOCKMODE lockmode, bool *detached_exist,
8489
TransactionId *detached_xmin)
8590
{
@@ -140,15 +145,13 @@ find_inheritance_children_extended(Oid parentrelId, bool omit_detached,
140145
if (detached_exist)
141146
*detached_exist = true;
142147

143-
if (omit_detached && ActiveSnapshotSet())
148+
if (omit_detached_snapshot)
144149
{
145150
TransactionId xmin;
146-
Snapshot snap;
147151

148152
xmin = HeapTupleHeaderGetXmin(inheritsTuple->t_data);
149-
snap = GetActiveSnapshot();
150153

151-
if (!XidInMVCCSnapshot(xmin, snap))
154+
if (!XidInMVCCSnapshot(xmin, omit_detached_snapshot))
152155
{
153156
if (detached_xmin)
154157
{

src/backend/executor/execPartition.c

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "utils/partcache.h"
3333
#include "utils/rls.h"
3434
#include "utils/ruleutils.h"
35+
#include "utils/snapmgr.h"
3536

3637

3738
/*-----------------------
@@ -1105,17 +1106,24 @@ ExecInitPartitionDispatchInfo(EState *estate,
11051106
MemoryContext oldcxt;
11061107

11071108
/*
1108-
* For data modification, it is better that executor does not include
1109-
* partitions being detached, except when running in snapshot-isolation
1110-
* mode. This means that a read-committed transaction immediately gets a
1109+
* For data modification, it is better that executor omits the partitions
1110+
* being detached, except when running in snapshot-isolation mode. This
1111+
* means that a read-committed transaction immediately gets a
11111112
* "no partition for tuple" error when a tuple is inserted into a
11121113
* partition that's being detached concurrently, but a transaction in
11131114
* repeatable-read mode can still use such a partition.
11141115
*/
11151116
if (estate->es_partition_directory == NULL)
1117+
{
1118+
Snapshot omit_detached_snapshot = NULL;
1119+
1120+
Assert(ActiveSnapshotSet());
1121+
if (!IsolationUsesXactSnapshot())
1122+
omit_detached_snapshot = GetActiveSnapshot();
11161123
estate->es_partition_directory =
11171124
CreatePartitionDirectory(estate->es_query_cxt,
1118-
!IsolationUsesXactSnapshot());
1125+
omit_detached_snapshot);
1126+
}
11191127

11201128
oldcxt = MemoryContextSwitchTo(proute->memcxt);
11211129

@@ -2011,10 +2019,13 @@ CreatePartitionPruneState(EState *estate, PartitionPruneInfo *pruneinfo,
20112019
*/
20122020
ExprContext *econtext = CreateExprContext(estate);
20132021

2014-
/* For data reading, executor always includes detached partitions */
2022+
/*
2023+
* For data reading, executor always includes detached partitions,
2024+
* so pass NULL for omit_detached_snapshot.
2025+
*/
20152026
if (estate->es_partition_directory == NULL)
20162027
estate->es_partition_directory =
2017-
CreatePartitionDirectory(estate->es_query_cxt, false);
2028+
CreatePartitionDirectory(estate->es_query_cxt, NULL);
20182029

20192030
n_part_hierarchies = list_length(pruneinfo->prune_infos);
20202031
Assert(n_part_hierarchies > 0);

src/backend/optimizer/util/plancat.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2389,11 +2389,15 @@ set_relation_partition_info(PlannerInfo *root, RelOptInfo *rel,
23892389

23902390
/*
23912391
* Create the PartitionDirectory infrastructure if we didn't already.
2392+
* Note that the planner always omits the partitions being detached
2393+
* concurrently.
23922394
*/
23932395
if (root->glob->partition_directory == NULL)
23942396
{
2397+
Assert(ActiveSnapshotSet());
23952398
root->glob->partition_directory =
2396-
CreatePartitionDirectory(CurrentMemoryContext, true);
2399+
CreatePartitionDirectory(CurrentMemoryContext,
2400+
GetActiveSnapshot());
23972401
}
23982402

23992403
partdesc = PartitionDirectoryLookup(root->glob->partition_directory,

src/backend/partitioning/partdesc.c

Lines changed: 59 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ typedef struct PartitionDirectoryData
3636
{
3737
MemoryContext pdir_mcxt;
3838
HTAB *pdir_hash;
39-
bool omit_detached;
39+
Snapshot omit_detached_snapshot;
4040
} PartitionDirectoryData;
4141

4242
typedef struct PartitionDirectoryEntry
@@ -47,17 +47,23 @@ typedef struct PartitionDirectoryEntry
4747
} PartitionDirectoryEntry;
4848

4949
static PartitionDesc RelationBuildPartitionDesc(Relation rel,
50-
bool omit_detached);
50+
Snapshot omit_detached_snapshot);
5151

5252

5353
/*
54-
* RelationGetPartitionDesc -- get partition descriptor, if relation is partitioned
54+
* RelationGetPartitionDescExt
55+
* Get partition descriptor of a partitioned table, building one and
56+
* caching it for later use if not already or if the cached one would
57+
* not be suitable for a given request
5558
*
5659
* We keep two partdescs in relcache: rd_partdesc includes all partitions
57-
* (even those being concurrently marked detached), while rd_partdesc_nodetached
58-
* omits (some of) those. We store the pg_inherits.xmin value for the latter,
59-
* to determine whether it can be validly reused in each case, since that
60-
* depends on the active snapshot.
60+
* (even the one being concurrently marked detached), while
61+
* rd_partdesc_nodetached omits the detach-pending partition. If the latter one
62+
* is present, rd_partdesc_nodetach_xmin would have been set to the xmin of
63+
* the detach-pending partition's pg_inherits row, which is used to determine
64+
* whether rd_partdesc_nodetach can be validly reused for a given request by
65+
* checking if the xmin appears visible to 'omit_detached_snapshot' passed by
66+
* the caller.
6167
*
6268
* Note: we arrange for partition descriptors to not get freed until the
6369
* relcache entry's refcount goes to zero (see hacks in RelationClose,
@@ -68,7 +74,7 @@ static PartitionDesc RelationBuildPartitionDesc(Relation rel,
6874
* that the data doesn't become stale.
6975
*/
7076
PartitionDesc
71-
RelationGetPartitionDesc(Relation rel, bool omit_detached)
77+
RelationGetPartitionDescExt(Relation rel, Snapshot omit_detached_snapshot)
7278
{
7379
Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
7480

@@ -77,36 +83,51 @@ RelationGetPartitionDesc(Relation rel, bool omit_detached)
7783
* do so when we are asked to include all partitions including detached;
7884
* and also when we know that there are no detached partitions.
7985
*
80-
* If there is no active snapshot, detached partitions aren't omitted
81-
* either, so we can use the cached descriptor too in that case.
86+
* omit_detached_snapshot being NULL means that the caller doesn't care
87+
* that the returned partition descriptor may contain detached partitions,
88+
* so we we can used the cached descriptor in that case too.
8289
*/
8390
if (likely(rel->rd_partdesc &&
84-
(!rel->rd_partdesc->detached_exist || !omit_detached ||
85-
!ActiveSnapshotSet())))
91+
(!rel->rd_partdesc->detached_exist ||
92+
omit_detached_snapshot == NULL)))
8693
return rel->rd_partdesc;
8794

8895
/*
89-
* If we're asked to omit detached partitions, we may be able to use a
90-
* cached descriptor too. We determine that based on the pg_inherits.xmin
91-
* that was saved alongside that descriptor: if the xmin that was not in
92-
* progress for that active snapshot is also not in progress for the
93-
* current active snapshot, then we can use it. Otherwise build one from
94-
* scratch.
96+
* If we're asked to omit the detached partition, we may be able to use
97+
* the other cached descriptor, which has been made to omit the detached
98+
* partition. Whether that descriptor can be reused in this case is
99+
* determined based on cross-checking the visibility of
100+
* rd_partdesc_nodetached_xmin, that is, the pg_inherits.xmin of the
101+
* pg_inherits row of the detached partition: if the xmin seems in-progress
102+
* to both the given omit_detached_snapshot and to the snapshot that would
103+
* have been passed when rd_partdesc_nodetached was built, then we can
104+
* reuse it. Otherwise we must build one from scratch.
95105
*/
96-
if (omit_detached &&
97-
rel->rd_partdesc_nodetached &&
98-
ActiveSnapshotSet())
106+
if (rel->rd_partdesc_nodetached && omit_detached_snapshot)
99107
{
100-
Snapshot activesnap;
101-
102108
Assert(TransactionIdIsValid(rel->rd_partdesc_nodetached_xmin));
103-
activesnap = GetActiveSnapshot();
104109

105-
if (!XidInMVCCSnapshot(rel->rd_partdesc_nodetached_xmin, activesnap))
110+
if (!XidInMVCCSnapshot(rel->rd_partdesc_nodetached_xmin,
111+
omit_detached_snapshot))
106112
return rel->rd_partdesc_nodetached;
107113
}
108114

109-
return RelationBuildPartitionDesc(rel, omit_detached);
115+
return RelationBuildPartitionDesc(rel, omit_detached_snapshot);
116+
}
117+
118+
/*
119+
* RelationGetPartitionDesc
120+
* Like RelationGetPartitionDescExt() but for callers that are fine with
121+
* ActiveSnapshot being used as omit_detached_snapshot
122+
*/
123+
PartitionDesc
124+
RelationGetPartitionDesc(Relation rel, bool omit_detached)
125+
{
126+
Snapshot snapshot = NULL;
127+
128+
if (omit_detached && ActiveSnapshotSet())
129+
snapshot = GetActiveSnapshot();
130+
return RelationGetPartitionDescExt(rel, snapshot);
110131
}
111132

112133
/*
@@ -131,7 +152,8 @@ RelationGetPartitionDesc(Relation rel, bool omit_detached)
131152
* for them.
132153
*/
133154
static PartitionDesc
134-
RelationBuildPartitionDesc(Relation rel, bool omit_detached)
155+
RelationBuildPartitionDesc(Relation rel,
156+
Snapshot omit_detached_snapshot)
135157
{
136158
PartitionDesc partdesc;
137159
PartitionBoundInfo boundinfo = NULL;
@@ -162,7 +184,8 @@ RelationBuildPartitionDesc(Relation rel, bool omit_detached)
162184
detached_exist = false;
163185
detached_xmin = InvalidTransactionId;
164186
inhoids = find_inheritance_children_extended(RelationGetRelid(rel),
165-
omit_detached, NoLock,
187+
omit_detached_snapshot,
188+
NoLock,
166189
&detached_exist,
167190
&detached_xmin);
168191

@@ -362,11 +385,11 @@ RelationBuildPartitionDesc(Relation rel, bool omit_detached)
362385
*
363386
* Note that if a partition was found by the catalog's scan to have been
364387
* detached, but the pg_inherit tuple saying so was not visible to the
365-
* active snapshot (find_inheritance_children_extended will not have set
366-
* detached_xmin in that case), we consider there to be no "omittable"
367-
* detached partitions.
388+
* omit_detached_snapshot (find_inheritance_children_extended() will not
389+
* have set detached_xmin in that case), we consider there to be no
390+
* "omittable" detached partitions.
368391
*/
369-
is_omit = omit_detached && detached_exist && ActiveSnapshotSet() &&
392+
is_omit = detached_exist && omit_detached_snapshot &&
370393
TransactionIdIsValid(detached_xmin);
371394

372395
/*
@@ -420,7 +443,7 @@ RelationBuildPartitionDesc(Relation rel, bool omit_detached)
420443
* Create a new partition directory object.
421444
*/
422445
PartitionDirectory
423-
CreatePartitionDirectory(MemoryContext mcxt, bool omit_detached)
446+
CreatePartitionDirectory(MemoryContext mcxt, Snapshot omit_detached_snapshot)
424447
{
425448
MemoryContext oldcontext = MemoryContextSwitchTo(mcxt);
426449
PartitionDirectory pdir;
@@ -435,7 +458,7 @@ CreatePartitionDirectory(MemoryContext mcxt, bool omit_detached)
435458

436459
pdir->pdir_hash = hash_create("partition directory", 256, &ctl,
437460
HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
438-
pdir->omit_detached = omit_detached;
461+
pdir->omit_detached_snapshot = omit_detached_snapshot;
439462

440463
MemoryContextSwitchTo(oldcontext);
441464
return pdir;
@@ -468,7 +491,8 @@ PartitionDirectoryLookup(PartitionDirectory pdir, Relation rel)
468491
*/
469492
RelationIncrementReferenceCount(rel);
470493
pde->rel = rel;
471-
pde->pd = RelationGetPartitionDesc(rel, pdir->omit_detached);
494+
pde->pd = RelationGetPartitionDescExt(rel,
495+
pdir->omit_detached_snapshot);
472496
Assert(pde->pd != NULL);
473497
}
474498
return pde->pd;

src/include/catalog/pg_inherits.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
#include "nodes/pg_list.h"
2525
#include "storage/lock.h"
26+
#include "utils/snapshot.h"
2627

2728
/* ----------------
2829
* pg_inherits definition. cpp turns this into
@@ -49,8 +50,10 @@ DECLARE_INDEX(pg_inherits_parent_index, 2187, InheritsParentIndexId, pg_inherits
4950

5051

5152
extern List *find_inheritance_children(Oid parentrelId, LOCKMODE lockmode);
52-
extern List *find_inheritance_children_extended(Oid parentrelId, bool omit_detached,
53-
LOCKMODE lockmode, bool *detached_exist, TransactionId *detached_xmin);
53+
extern List *find_inheritance_children_extended(Oid parentrelId,
54+
Snapshot omit_detached_snapshot,
55+
LOCKMODE lockmode, bool *detached_exist,
56+
TransactionId *detached_xmin);
5457

5558
extern List *find_all_inheritors(Oid parentrelId, LOCKMODE lockmode,
5659
List **numparents);

src/include/partitioning/partdesc.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
#include "partitioning/partdefs.h"
1616
#include "utils/relcache.h"
17+
#include "utils/snapshot.h"
1718

1819
/*
1920
* Information about partitions of a partitioned table.
@@ -65,8 +66,11 @@ typedef struct PartitionDescData
6566

6667

6768
extern PartitionDesc RelationGetPartitionDesc(Relation rel, bool omit_detached);
69+
extern PartitionDesc RelationGetPartitionDescExt(Relation rel,
70+
Snapshot omit_detached_snapshot);
6871

69-
extern PartitionDirectory CreatePartitionDirectory(MemoryContext mcxt, bool omit_detached);
72+
extern PartitionDirectory CreatePartitionDirectory(MemoryContext mcxt,
73+
Snapshot omit_detached_snapshot);
7074
extern PartitionDesc PartitionDirectoryLookup(PartitionDirectory, Relation);
7175
extern void DestroyPartitionDirectory(PartitionDirectory pdir);
7276

0 commit comments

Comments
 (0)