
Commit c106ef0

Use BumpContext contexts in TupleHashTables, and do some code cleanup.
For all extant uses of TupleHashTables, execGrouping.c itself does nothing
with the "tablecxt" except to allocate new hash entries in it, and the
callers do nothing with it except to reset the whole context.  So this is
an ideal use-case for a BumpContext, and the hash tables are frequently
big enough for the savings to be significant.  (Commit cc721c4 already
taught nodeAgg.c this idea, but neglected the other callers of
BuildTupleHashTable.)

While at it, let's clean up some ill-advised leftovers from rebasing
TupleHashTables on simplehash.h:

* Many comments and variable names were based on the idea that the
tablecxt holds the whole TupleHashTable, whereas now it only holds the
hashed tuples (plus any caller-defined "additional storage").  Rename
to names like tuplescxt and tuplesContext, and adjust the comments.
Also adjust the memory context names to be like "<Foo> hashed tuples".

* Make ResetTupleHashTable() reset the tuplescxt rather than relying on
the caller to do so; that was fairly bizarre and seems like a recipe
for leaks.  This is less efficient in the case where nodeAgg.c uses the
same tuplescxt for several different hashtables, but only
microscopically so because mcxt.c will short-circuit the extra resets
via its isReset flag.  I judge the extra safety and intellectual
cleanliness well worth those few cycles.

* Remove the long-obsolete "allow_jit" check added by ac88807; instead,
just Assert that metacxt and tuplescxt are different.  We need that
anyway for this definition of ResetTupleHashTable() to be safe.

There is a side issue of the extent to which this change invalidates
the planner's estimates of hashtable memory consumption.  However,
those estimates are already pretty bad, so improving them seems like
it can be a separate project.  This change is useful to do first to
establish consistent executor behavior that the planner can expect.

A loose end not addressed here is that the "entrysize" calculation in
BuildTupleHashTable seems wrong: "sizeof(TupleHashEntryData) +
additionalsize" corresponds neither to the size of the simplehash
entries nor to the total space needed per tuple.  It's questionable
why BuildTupleHashTable is second-guessing its caller's nbuckets
choice at all, since the original source of the number should have had
more information.  But that all seems wrapped up with the planner's
estimation logic, so let's leave it for the planned followup patch.

Reported-by: Jeff Janes <[email protected]>
Reported-by: David Rowley <[email protected]>
Author: Tom Lane <[email protected]>
Reviewed-by: David Rowley <[email protected]>
Discussion: https://postgr.es/m/CAMkU=1zia0JfW_QR8L5xA2vpa0oqVuiapm78h=WpNsHH13_9uw@mail.gmail.com
Discussion: https://postgr.es/m/[email protected]
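The savings come from bump allocation: entries are carved sequentially out of large blocks, with no per-chunk header and no support for freeing individual chunks, and the whole context is reclaimed at once. A toy sketch of the idea — not PostgreSQL's actual implementation, which lives in src/backend/utils/mmgr/bump.c and mcxt.c — including the isReset-style short-circuit that the commit message relies on to make redundant resets cheap:

```c
#include <assert.h>
#include <stddef.h>

/* Toy bump context: pointer-bump allocation out of one backing block,
 * wholesale reset only.  Illustrative sketch; the real BumpContext manages
 * a chain of blocks and can grow, but the absence of per-chunk headers is
 * the same source of memory savings over an AllocSet context. */
typedef struct BumpArena
{
    char   *base;       /* start of the backing block */
    size_t  capacity;   /* total bytes available */
    size_t  used;       /* high-water mark of allocations */
    int     isReset;    /* true when nothing allocated since last reset */
} BumpArena;

static void *
bump_alloc(BumpArena *a, size_t size)
{
    size = (size + 7) & ~(size_t) 7;    /* MAXALIGN-style rounding */
    if (a->used + size > a->capacity)
        return NULL;                    /* real allocator would add a block */
    void   *p = a->base + a->used;

    a->used += size;
    a->isReset = 0;
    return p;
}

static void
bump_reset(BumpArena *a)
{
    if (a->isReset)
        return;                 /* redundant resets short-circuit, as mcxt.c
                                 * does via its isReset flag */
    a->used = 0;
    a->isReset = 1;
}
```

This is why resetting the same shared tuplescxt once per hashtable costs only a flag test after the first reset.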
1 parent e1ac846 commit c106ef0

File tree

7 files changed: +103 −85 lines

src/backend/executor/execGrouping.c

Lines changed: 43 additions & 23 deletions
@@ -145,8 +145,8 @@ execTuplesHashPrepare(int numCols,
  * collations: collations to use in comparisons
  * nbuckets: initial estimate of hashtable size
  * additionalsize: size of data that may be stored along with the hash entry
- * metacxt: memory context for long-lived allocation, but not per-entry data
- * tablecxt: memory context in which to store table entries
+ * metacxt: memory context for long-lived data and the simplehash table
+ * tuplescxt: memory context in which to store the hashed tuples themselves
  * tempcxt: short-lived context for evaluation hash and comparison functions
  * use_variable_hash_iv: if true, adjust hash IV per-parallel-worker
  *
@@ -157,11 +157,25 @@ execTuplesHashPrepare(int numCols,
  * Note that the keyColIdx, hashfunctions, and collations arrays must be
  * allocated in storage that will live as long as the hashtable does.
  *
+ * The metacxt and tuplescxt are separate because it's usually desirable for
+ * tuplescxt to be a BumpContext to avoid memory wastage, while metacxt must
+ * support pfree in case the simplehash table needs to be enlarged.  (We could
+ * simplify the API of TupleHashTables by managing the tuplescxt internally.
+ * But that would be disadvantageous to nodeAgg.c and nodeSubplan.c, which use
+ * a single tuplescxt for multiple TupleHashTables that are reset together.)
+ *
  * LookupTupleHashEntry, FindTupleHashEntry, and related functions may leak
  * memory in the tempcxt.  It is caller's responsibility to reset that context
  * reasonably often, typically once per tuple.  (We do it that way, rather
  * than managing an extra context within the hashtable, because in many cases
  * the caller can specify a tempcxt that it needs to reset per-tuple anyway.)
+ *
+ * We don't currently provide DestroyTupleHashTable functionality; the hash
+ * table will be cleaned up at destruction of the metacxt.  (Some callers
+ * bother to delete the tuplescxt explicitly, though it'd be sufficient to
+ * ensure it's a child of the metacxt.)  There's not much point in working
+ * harder than this so long as the expression-evaluation infrastructure
+ * behaves similarly.
  */
 TupleHashTable
 BuildTupleHashTable(PlanState *parent,
@@ -175,22 +189,32 @@ BuildTupleHashTable(PlanState *parent,
                     long nbuckets,
                     Size additionalsize,
                     MemoryContext metacxt,
-                    MemoryContext tablecxt,
+                    MemoryContext tuplescxt,
                     MemoryContext tempcxt,
                     bool use_variable_hash_iv)
 {
     TupleHashTable hashtable;
     Size        entrysize;
     Size        hash_mem_limit;
     MemoryContext oldcontext;
-    bool        allow_jit;
     uint32      hash_iv = 0;
 
     Assert(nbuckets > 0);
+
+    /* tuplescxt must be separate, else ResetTupleHashTable breaks things */
+    Assert(metacxt != tuplescxt);
+
+    /* ensure additionalsize is maxalign'ed */
     additionalsize = MAXALIGN(additionalsize);
-    entrysize = sizeof(TupleHashEntryData) + additionalsize;
 
-    /* Limit initial table size request to not more than hash_mem */
+    /*
+     * Limit initial table size request to not more than hash_mem.
+     *
+     * XXX this calculation seems pretty misguided, as it counts only overhead
+     * and not the tuples themselves.  But we have no knowledge of the
+     * expected tuple width here.
+     */
+    entrysize = sizeof(TupleHashEntryData) + additionalsize;
     hash_mem_limit = get_hash_memory_limit() / entrysize;
     if (nbuckets > hash_mem_limit)
         nbuckets = hash_mem_limit;
@@ -202,7 +226,7 @@ BuildTupleHashTable(PlanState *parent,
     hashtable->numCols = numCols;
     hashtable->keyColIdx = keyColIdx;
     hashtable->tab_collations = collations;
-    hashtable->tablecxt = tablecxt;
+    hashtable->tuplescxt = tuplescxt;
     hashtable->tempcxt = tempcxt;
     hashtable->additionalsize = additionalsize;
     hashtable->tableslot = NULL;    /* will be made on first lookup */
@@ -230,24 +254,14 @@ BuildTupleHashTable(PlanState *parent,
     hashtable->tableslot = MakeSingleTupleTableSlot(CreateTupleDescCopy(inputDesc),
                                                     &TTSOpsMinimalTuple);
 
-    /*
-     * If the caller fails to make the metacxt different from the tablecxt,
-     * allowing JIT would lead to the generated functions to a) live longer
-     * than the query or b) be re-generated each time the table is being
-     * reset.  Therefore prevent JIT from being used in that case, by not
-     * providing a parent node (which prevents accessing the JitContext in the
-     * EState).
-     */
-    allow_jit = (metacxt != tablecxt);
-
     /* build hash ExprState for all columns */
     hashtable->tab_hash_expr = ExecBuildHash32FromAttrs(inputDesc,
                                                         inputOps,
                                                         hashfunctions,
                                                         collations,
                                                         numCols,
                                                         keyColIdx,
-                                                        allow_jit ? parent : NULL,
+                                                        parent,
                                                         hash_iv);
 
     /* build comparator for all columns */
@@ -256,7 +270,7 @@ BuildTupleHashTable(PlanState *parent,
                                                      &TTSOpsMinimalTuple,
                                                      numCols,
                                                      keyColIdx, eqfuncoids, collations,
-                                                     allow_jit ? parent : NULL);
+                                                     parent);
 
     /*
      * While not pretty, it's ok to not shut down this context, but instead
@@ -273,13 +287,19 @@ BuildTupleHashTable(PlanState *parent,
 
 /*
  * Reset contents of the hashtable to be empty, preserving all the non-content
- * state.  Note that the tablecxt passed to BuildTupleHashTable() should
- * also be reset, otherwise there will be leaks.
+ * state.
+ *
+ * Note: in usages where several TupleHashTables share a tuplescxt, all must
+ * be reset together, as the first one's reset call will destroy all their
+ * data.  The additional reset calls for the rest will redundantly reset the
+ * tuplescxt.  But because of mcxt.c's isReset flag, that's cheap enough that
+ * we need not avoid it.
  */
 void
 ResetTupleHashTable(TupleHashTable hashtable)
 {
     tuplehash_reset(hashtable->hashtab);
+    MemoryContextReset(hashtable->tuplescxt);
 }
 
 /*
@@ -489,10 +509,10 @@ LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot,
         /* created new entry */
         *isnew = true;
 
-        MemoryContextSwitchTo(hashtable->tablecxt);
+        MemoryContextSwitchTo(hashtable->tuplescxt);
 
         /*
-         * Copy the first tuple into the table context, and request
+         * Copy the first tuple into the tuples context, and request
          * additionalsize extra bytes before the allocation.
          *
         * The caller can get a pointer to the additional data with
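The MAXALIGN'ing of additionalsize matters because the caller's additional data lives in the same allocation as the copied tuple, so it must start on a maxaligned boundary for the caller to store any struct there safely. A self-contained sketch of the rounding, assuming 8-byte maximum alignment (the real MAXALIGN macro in c.h uses the platform's configured MAXIMUM_ALIGNOF, and the layout helper below is hypothetical, not execGrouping.c's code):

```c
#include <assert.h>
#include <stddef.h>

/* Sketch of MAXALIGN-style rounding: round a size up to a multiple of the
 * assumed maximum alignment (8 bytes here). */
#define MY_MAXALIGN(len) (((size_t) (len) + 7) & ~(size_t) 7)

/* Hypothetical per-entry sizing: with both pieces maxaligned, the caller's
 * additional data adjacent to the tuple begins on an aligned boundary. */
static size_t
entry_alloc_size(size_t tuple_len, size_t additionalsize)
{
    return MY_MAXALIGN(tuple_len) + MY_MAXALIGN(additionalsize);
}
```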

src/backend/executor/nodeAgg.c

Lines changed: 15 additions & 17 deletions
@@ -1457,7 +1457,7 @@ find_cols_walker(Node *node, FindColsContext *context)
  * We have a separate hashtable and associated perhash data structure for each
  * grouping set for which we're doing hashing.
  *
- * The contents of the hash tables always live in the hashcontext's per-tuple
+ * The contents of the hash tables live in the aggstate's hash_tuplescxt
  * memory context (there is only one of these for all tables together, since
  * they are all reset at the same time).
  */
@@ -1509,7 +1509,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
 {
     AggStatePerHash perhash = &aggstate->perhash[setno];
     MemoryContext metacxt = aggstate->hash_metacxt;
-    MemoryContext tablecxt = aggstate->hash_tablecxt;
+    MemoryContext tuplescxt = aggstate->hash_tuplescxt;
     MemoryContext tmpcxt = aggstate->tmpcontext->ecxt_per_tuple_memory;
     Size        additionalsize;
 
@@ -1535,7 +1535,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
                                             nbuckets,
                                             additionalsize,
                                             metacxt,
-                                            tablecxt,
+                                            tuplescxt,
                                             tmpcxt,
                                             DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
 }
@@ -1868,7 +1868,7 @@ hash_agg_check_limits(AggState *aggstate)
     uint64      ngroups = aggstate->hash_ngroups_current;
     Size        meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt,
                                                      true);
-    Size        entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt,
+    Size        entry_mem = MemoryContextMemAllocated(aggstate->hash_tuplescxt,
                                                       true);
     Size        tval_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory,
                                                      true);
@@ -1959,7 +1959,7 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
     meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true);
 
     /* memory for hash entries */
-    entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt, true);
+    entry_mem = MemoryContextMemAllocated(aggstate->hash_tuplescxt, true);
 
     /* memory for byref transition states */
     hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true);
@@ -2042,11 +2042,11 @@ hash_create_memory(AggState *aggstate)
     /* and no smaller than ALLOCSET_DEFAULT_INITSIZE */
     maxBlockSize = Max(maxBlockSize, ALLOCSET_DEFAULT_INITSIZE);
 
-    aggstate->hash_tablecxt = BumpContextCreate(aggstate->ss.ps.state->es_query_cxt,
-                                                "HashAgg table context",
-                                                ALLOCSET_DEFAULT_MINSIZE,
-                                                ALLOCSET_DEFAULT_INITSIZE,
-                                                maxBlockSize);
+    aggstate->hash_tuplescxt = BumpContextCreate(aggstate->ss.ps.state->es_query_cxt,
+                                                 "HashAgg hashed tuples",
+                                                 ALLOCSET_DEFAULT_MINSIZE,
+                                                 ALLOCSET_DEFAULT_INITSIZE,
+                                                 maxBlockSize);
 
 }
 
@@ -2707,7 +2707,6 @@ agg_refill_hash_table(AggState *aggstate)
 
     /* free memory and reset hash tables */
     ReScanExprContext(aggstate->hashcontext);
-    MemoryContextReset(aggstate->hash_tablecxt);
     for (int setno = 0; setno < aggstate->num_hashes; setno++)
         ResetTupleHashTable(aggstate->perhash[setno].hashtable);
 
@@ -4428,18 +4427,18 @@ ExecEndAgg(AggState *node)
 
     hashagg_reset_spill_state(node);
 
+    /* Release hash tables too */
     if (node->hash_metacxt != NULL)
     {
         MemoryContextDelete(node->hash_metacxt);
         node->hash_metacxt = NULL;
     }
-    if (node->hash_tablecxt != NULL)
+    if (node->hash_tuplescxt != NULL)
     {
-        MemoryContextDelete(node->hash_tablecxt);
-        node->hash_tablecxt = NULL;
+        MemoryContextDelete(node->hash_tuplescxt);
+        node->hash_tuplescxt = NULL;
     }
 
-
     for (transno = 0; transno < node->numtrans; transno++)
     {
         AggStatePerTrans pertrans = &node->pertrans[transno];
@@ -4555,8 +4554,7 @@ ExecReScanAgg(AggState *node)
     node->hash_ngroups_current = 0;
 
     ReScanExprContext(node->hashcontext);
-    MemoryContextReset(node->hash_tablecxt);
-    /* Rebuild an empty hash table */
+    /* Rebuild empty hash table(s) */
     build_hash_tables(node);
     node->table_filled = false;
     /* iterator will be reset when the table is filled */
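The hash_agg_check_limits change above only redirects which context's memory is measured (hash_tuplescxt instead of hash_tablecxt); the surrounding decision is to spill once either the group count or the memory charged to the metadata, entry, and by-ref transition-value contexts grows too large. A hedged sketch of that decision — the function name and exact comparison below are illustrative, not the literal PostgreSQL code, which reads the sizes via MemoryContextMemAllocated:

```c
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical mirror of hash_agg_check_limits' decision: spill when the
 * number of groups exceeds its limit, or when the memory attributed to the
 * three context families (metadata, hashed-tuple entries, by-ref transition
 * values) together exceed the hash memory limit. */
static bool
hash_should_spill(uint64_t ngroups, uint64_t ngroups_limit,
                  size_t meta_mem, size_t entry_mem, size_t tval_mem,
                  size_t hash_mem_limit)
{
    if (ngroups > ngroups_limit)
        return true;
    return meta_mem + entry_mem + tval_mem > hash_mem_limit;
}
```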

src/backend/executor/nodeRecursiveunion.c

Lines changed: 11 additions & 14 deletions
@@ -53,7 +53,7 @@ build_hash_table(RecursiveUnionState *rustate)
                                             node->numGroups,
                                             0,
                                             rustate->ps.state->es_query_cxt,
-                                            rustate->tableContext,
+                                            rustate->tuplesContext,
                                             rustate->tempContext,
                                             false);
 }
@@ -197,7 +197,7 @@ ExecInitRecursiveUnion(RecursiveUnion *node, EState *estate, int eflags)
     rustate->hashfunctions = NULL;
     rustate->hashtable = NULL;
     rustate->tempContext = NULL;
-    rustate->tableContext = NULL;
+    rustate->tuplesContext = NULL;
 
     /* initialize processing state */
     rustate->recursing = false;
@@ -209,18 +209,19 @@ ExecInitRecursiveUnion(RecursiveUnion *node, EState *estate, int eflags)
     * If hashing, we need a per-tuple memory context for comparisons, and a
     * longer-lived context to store the hash table.  The table can't just be
     * kept in the per-query context because we want to be able to throw it
-    * away when rescanning.
+    * away when rescanning.  We can use a BumpContext to save storage,
+    * because we will have no need to delete individual table entries.
     */
    if (node->numCols > 0)
    {
        rustate->tempContext =
            AllocSetContextCreate(CurrentMemoryContext,
                                  "RecursiveUnion",
                                  ALLOCSET_DEFAULT_SIZES);
-       rustate->tableContext =
-           AllocSetContextCreate(CurrentMemoryContext,
-                                 "RecursiveUnion hash table",
-                                 ALLOCSET_DEFAULT_SIZES);
+       rustate->tuplesContext =
+           BumpContextCreate(CurrentMemoryContext,
+                             "RecursiveUnion hashed tuples",
+                             ALLOCSET_DEFAULT_SIZES);
    }
 
    /*
@@ -288,11 +289,11 @@ ExecEndRecursiveUnion(RecursiveUnionState *node)
     tuplestore_end(node->working_table);
     tuplestore_end(node->intermediate_table);
 
-    /* free subsidiary stuff including hashtable */
+    /* free subsidiary stuff including hashtable data */
     if (node->tempContext)
         MemoryContextDelete(node->tempContext);
-    if (node->tableContext)
-        MemoryContextDelete(node->tableContext);
+    if (node->tuplesContext)
+        MemoryContextDelete(node->tuplesContext);
 
     /*
      * close down subplans
@@ -328,10 +329,6 @@ ExecReScanRecursiveUnion(RecursiveUnionState *node)
     if (outerPlan->chgParam == NULL)
         ExecReScan(outerPlan);
 
-    /* Release any hashtable storage */
-    if (node->tableContext)
-        MemoryContextReset(node->tableContext);
-
     /* Empty hashtable if needed */
     if (plan->numCols > 0)
         ResetTupleHashTable(node->hashtable);
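The new execGrouping.c comment notes that the explicit MemoryContextDelete of the tuples context in ExecEnd functions like the one above would be unnecessary if that context were a child of the metacxt, because deleting a memory context recursively deletes its children. A toy illustration of that parent/child semantics — not PostgreSQL's actual MemoryContextDelete, which also runs reset callbacks and does considerably more bookkeeping:

```c
#include <assert.h>
#include <stdlib.h>

static int live_contexts = 0;   /* how many toy contexts currently exist */

typedef struct Ctx
{
    struct Ctx *firstchild;
    struct Ctx *nextsibling;
} Ctx;

/* Create a context, optionally linked under a parent. */
static Ctx *
ctx_create(Ctx *parent)
{
    Ctx *c = calloc(1, sizeof(Ctx));

    if (parent)
    {
        c->nextsibling = parent->firstchild;
        parent->firstchild = c;
    }
    live_contexts++;
    return c;
}

/* Delete a context: children go down with the parent, recursively. */
static void
ctx_delete(Ctx *c)
{
    Ctx *child = c->firstchild;

    while (child)
    {
        Ctx *next = child->nextsibling;

        ctx_delete(child);
        child = next;
    }
    live_contexts--;
    free(c);
}
```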

src/backend/executor/nodeSetOp.c

Lines changed: 12 additions & 14 deletions
@@ -106,7 +106,7 @@ build_hash_table(SetOpState *setopstate)
                                             node->numGroups,
                                             sizeof(SetOpStatePerGroupData),
                                             setopstate->ps.state->es_query_cxt,
-                                            setopstate->tableContext,
+                                            setopstate->tuplesContext,
                                             econtext->ecxt_per_tuple_memory,
                                             false);
 }
@@ -589,13 +589,15 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags)
     /*
      * If hashing, we also need a longer-lived context to store the hash
      * table.  The table can't just be kept in the per-query context because
-     * we want to be able to throw it away in ExecReScanSetOp.
+     * we want to be able to throw it away in ExecReScanSetOp.  We can use a
+     * BumpContext to save storage, because we will have no need to delete
+     * individual table entries.
      */
     if (node->strategy == SETOP_HASHED)
-        setopstate->tableContext =
-            AllocSetContextCreate(CurrentMemoryContext,
-                                  "SetOp hash table",
-                                  ALLOCSET_DEFAULT_SIZES);
+        setopstate->tuplesContext =
+            BumpContextCreate(CurrentMemoryContext,
+                              "SetOp hashed tuples",
+                              ALLOCSET_DEFAULT_SIZES);
 
     /*
      * initialize child nodes
@@ -680,9 +682,9 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags)
 void
 ExecEndSetOp(SetOpState *node)
 {
-    /* free subsidiary stuff including hashtable */
-    if (node->tableContext)
-        MemoryContextDelete(node->tableContext);
+    /* free subsidiary stuff including hashtable data */
+    if (node->tuplesContext)
+        MemoryContextDelete(node->tuplesContext);
 
     ExecEndNode(outerPlanState(node));
     ExecEndNode(innerPlanState(node));
@@ -721,11 +723,7 @@ ExecReScanSetOp(SetOpState *node)
         return;
     }
 
-    /* Release any hashtable storage */
-    if (node->tableContext)
-        MemoryContextReset(node->tableContext);
-
-    /* And rebuild an empty hashtable */
+    /* Else, we must rebuild the hashtable */
     ResetTupleHashTable(node->hashtable);
     node->table_filled = false;
 }
