Skip to content

Commit bc2aca2

Browse files
author
Commitfest Bot
committed
[CF 5738] v2 - Improve hash join's handling of tuples with null join keys
This branch was automatically generated by a robot using patches from an email thread registered at <https://commitfest.postgresql.org/patch/5738>. The branch will be overwritten each time a new patch version is posted to the thread, and also periodically to check for bitrot caused by changes on the master branch. Patch(es): <https://www.postgresql.org/message-id/[email protected]>. Author(s): Tom Lane.
2 parents c9e38a5 + 8736a6b commit bc2aca2

File tree

13 files changed

+381
-71
lines changed

13 files changed

+381
-71
lines changed

src/backend/executor/execExpr.c

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4282,25 +4282,27 @@ ExecBuildHash32FromAttrs(TupleDesc desc, const TupleTableSlotOps *ops,
42824282
* 'hash_exprs'. When multiple expressions are present, the hash values
42834283
* returned by each hash function are combined to produce a single hash value.
42844284
*
4285+
* If any hash_expr yields NULL and the corresponding hash operator is strict,
4286+
* the created ExprState will return NULL. (If the operator is not strict,
4287+
* we treat NULL values as having a hash value of zero. The hash functions
4288+
* themselves are always treated as strict.)
4289+
*
42854290
* desc: tuple descriptor for the to-be-hashed expressions
42864291
* ops: TupleTableSlotOps for the TupleDesc
42874292
* hashfunc_oids: Oid for each hash function to call, one for each 'hash_expr'
4288-
* collations: collation to use when calling the hash function.
4289-
* hash_expr: list of expressions to hash the value of
4290-
* opstrict: array corresponding to the 'hashfunc_oids' to store op_strict()
4293+
* collations: collation to use when calling the hash function
4294+
* hash_exprs: list of expressions to hash the value of
4295+
* opstrict: strictness flag for each hash function's comparison operator
42914296
* parent: PlanState node that the 'hash_exprs' will be evaluated at
42924297
* init_value: Normally 0, but can be set to other values to seed the hash
42934298
* with some other value. Using non-zero is slightly less efficient but can
42944299
* be useful.
4295-
* keep_nulls: if true, evaluation of the returned ExprState will abort early
4296-
* returning NULL if the given hash function is strict and the Datum to hash
4297-
* is null. When set to false, any NULL input Datums are skipped.
42984300
*/
42994301
ExprState *
43004302
ExecBuildHash32Expr(TupleDesc desc, const TupleTableSlotOps *ops,
43014303
const Oid *hashfunc_oids, const List *collations,
43024304
const List *hash_exprs, const bool *opstrict,
4303-
PlanState *parent, uint32 init_value, bool keep_nulls)
4305+
PlanState *parent, uint32 init_value)
43044306
{
43054307
ExprState *state = makeNode(ExprState);
43064308
ExprEvalStep scratch = {0};
@@ -4377,8 +4379,8 @@ ExecBuildHash32Expr(TupleDesc desc, const TupleTableSlotOps *ops,
43774379
fmgr_info(funcid, finfo);
43784380

43794381
/*
4380-
* Build the steps to evaluate the hash function's argument have it so
4381-
* the value of that is stored in the 0th argument of the hash func.
4382+
* Build the steps to evaluate the hash function's argument, placing
4383+
* the value in the 0th argument of the hash func.
43824384
*/
43834385
ExecInitExprRec(expr,
43844386
state,
@@ -4413,7 +4415,7 @@ ExecBuildHash32Expr(TupleDesc desc, const TupleTableSlotOps *ops,
44134415
scratch.d.hashdatum.fcinfo_data = fcinfo;
44144416
scratch.d.hashdatum.fn_addr = finfo->fn_addr;
44154417

4416-
scratch.opcode = opstrict[i] && !keep_nulls ? strict_opcode : opcode;
4418+
scratch.opcode = opstrict[i] ? strict_opcode : opcode;
44174419
scratch.d.hashdatum.jumpdone = -1;
44184420

44194421
ExprEvalPushStep(state, &scratch);

src/backend/executor/nodeHash.c

Lines changed: 55 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -153,8 +153,11 @@ MultiExecPrivateHash(HashState *node)
153153
econtext = node->ps.ps_ExprContext;
154154

155155
/*
156-
* Get all tuples from the node below the Hash node and insert into the
157-
* hash table (or temp files).
156+
* Get all tuples from the node below the Hash node and insert the
157+
* potentially-matchable ones into the hash table (or temp files). Tuples
158+
* that can't possibly match because they have null join keys are dumped
159+
* into a separate tuplestore, or just summarily discarded if we don't
160+
* need to emit them with null-extension.
158161
*/
159162
for (;;)
160163
{
@@ -174,6 +177,7 @@ MultiExecPrivateHash(HashState *node)
174177

175178
if (!isnull)
176179
{
180+
/* normal case with a non-null join key */
177181
uint32 hashvalue = DatumGetUInt32(hashdatum);
178182
int bucketNumber;
179183

@@ -192,6 +196,14 @@ MultiExecPrivateHash(HashState *node)
192196
}
193197
hashtable->totalTuples += 1;
194198
}
199+
else if (node->keep_null_tuples)
200+
{
201+
/* null join key, but we must save tuple to be emitted later */
202+
if (node->null_tuple_store == NULL)
203+
node->null_tuple_store = ExecHashBuildNullTupleStore(hashtable);
204+
tuplestore_puttupleslot(node->null_tuple_store, slot);
205+
}
206+
/* else we can discard the tuple immediately */
195207
}
196208

197209
/* resize the hash table if needed (NTUP_PER_BUCKET exceeded) */
@@ -222,7 +234,6 @@ MultiExecParallelHash(HashState *node)
222234
HashJoinTable hashtable;
223235
TupleTableSlot *slot;
224236
ExprContext *econtext;
225-
uint32 hashvalue;
226237
Barrier *build_barrier;
227238
int i;
228239

@@ -282,6 +293,7 @@ MultiExecParallelHash(HashState *node)
282293
for (;;)
283294
{
284295
bool isnull;
296+
uint32 hashvalue;
285297

286298
slot = ExecProcNode(outerNode);
287299
if (TupIsNull(slot))
@@ -295,8 +307,19 @@ MultiExecParallelHash(HashState *node)
295307
&isnull));
296308

297309
if (!isnull)
310+
{
311+
/* normal case with a non-null join key */
298312
ExecParallelHashTableInsert(hashtable, slot, hashvalue);
299-
hashtable->partialTuples++;
313+
hashtable->partialTuples++;
314+
}
315+
else if (node->keep_null_tuples)
316+
{
317+
/* null join key, but save tuple to be emitted later */
318+
if (node->null_tuple_store == NULL)
319+
node->null_tuple_store = ExecHashBuildNullTupleStore(hashtable);
320+
tuplestore_puttupleslot(node->null_tuple_store, slot);
321+
}
322+
/* else we can discard the tuple immediately */
300323
}
301324

302325
/*
@@ -404,14 +427,10 @@ ExecInitHash(Hash *node, EState *estate, int eflags)
404427

405428
Assert(node->plan.qual == NIL);
406429

407-
/*
408-
* Delay initialization of hash_expr until ExecInitHashJoin(). We cannot
409-
* build the ExprState here as we don't yet know the join type we're going
410-
* to be hashing values for and we need to know that before calling
411-
* ExecBuildHash32Expr as the keep_nulls parameter depends on the join
412-
* type.
413-
*/
430+
/* these fields will be filled by ExecInitHashJoin() */
414431
hashstate->hash_expr = NULL;
432+
hashstate->null_tuple_store = NULL;
433+
hashstate->keep_null_tuples = false;
415434

416435
return hashstate;
417436
}
@@ -2753,6 +2772,31 @@ ExecHashRemoveNextSkewBucket(HashJoinTable hashtable)
27532772
}
27542773
}
27552774

2775+
/*
2776+
* Build a tuplestore suitable for holding null-keyed input tuples.
2777+
* (This function doesn't care whether it's for outer or inner tuples.)
2778+
*
2779+
* Note that in a parallel hash join, each worker has its own tuplestore(s)
2780+
* for these. There's no need to interact with other workers to decide
2781+
* what to do with them. So they're always in private storage.
2782+
*/
2783+
Tuplestorestate *
2784+
ExecHashBuildNullTupleStore(HashJoinTable hashtable)
2785+
{
2786+
Tuplestorestate *tstore;
2787+
MemoryContext oldcxt;
2788+
2789+
/*
2790+
* We keep the tuplestore in the hashCxt to ensure it won't go away too
2791+
* soon. Size it at work_mem/16 so that it doesn't bloat the node's space
2792+
* consumption too much.
2793+
*/
2794+
oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
2795+
tstore = tuplestore_begin_heap(false, false, work_mem / 16);
2796+
MemoryContextSwitchTo(oldcxt);
2797+
return tstore;
2798+
}
2799+
27562800
/*
27572801
* Reserve space in the DSM segment for instrumentation data.
27582802
*/

0 commit comments

Comments
 (0)