Skip to content

Commit bc06441

Browse files
author
Commitfest Bot
committed
[CF 5805] v2 - Improve array-element-test estimation when no array elements qualify as common
This branch was automatically generated by a robot using patches from an email thread registered at:

https://commitfest.postgresql.org/patch/5805

The branch will be overwritten each time a new patch version is posted to the thread, and also periodically to check for bitrot caused by changes on the master branch.

Patch(es): https://www.postgresql.org/message-id/[email protected]
Author(s): Tom Lane
2 parents 0951942 + 184474c commit bc06441

File tree

7 files changed

+67
-30
lines changed

7 files changed

+67
-30
lines changed

contrib/intarray/_int_selfuncs.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -210,8 +210,8 @@ _int_matchsel(PG_FUNCTION_ARGS)
210210
*/
211211
if (sslot.nnumbers == sslot.nvalues + 3)
212212
{
213-
/* Grab the lowest frequency. */
214-
minfreq = sslot.numbers[sslot.nnumbers - (sslot.nnumbers - sslot.nvalues)];
213+
/* Grab the minimal MCE frequency. */
214+
minfreq = sslot.numbers[sslot.nvalues];
215215

216216
mcelems = sslot.values;
217217
mcefreqs = sslot.numbers;
@@ -270,9 +270,9 @@ int_query_opr_selec(ITEM *item, Datum *mcelems, float4 *mcefreqs,
270270
{
271271
/*
272272
* The element is not in MCELEM. Punt, but assume that the
273-
* selectivity cannot be more than minfreq / 2.
273+
* selectivity cannot be more than minfreq.
274274
*/
275-
selec = Min(DEFAULT_EQ_SEL, minfreq / 2);
275+
selec = Min(DEFAULT_EQ_SEL, minfreq);
276276
}
277277
}
278278
else if (item->type == OPR)

src/backend/commands/analyze.c

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1711,10 +1711,9 @@ update_attstats(Oid relid, bool inh, int natts, VacAttrStats **vacattrstats)
17111711
i = Anum_pg_statistic_stanumbers1 - 1;
17121712
for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
17131713
{
1714-
int nnum = stats->numnumbers[k];
1715-
1716-
if (nnum > 0)
1714+
if (stats->stanumbers[k] != NULL)
17171715
{
1716+
int nnum = stats->numnumbers[k];
17181717
Datum *numdatums = (Datum *) palloc(nnum * sizeof(Datum));
17191718
ArrayType *arry;
17201719

@@ -1732,7 +1731,7 @@ update_attstats(Oid relid, bool inh, int natts, VacAttrStats **vacattrstats)
17321731
i = Anum_pg_statistic_stavalues1 - 1;
17331732
for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
17341733
{
1735-
if (stats->numvalues[k] > 0)
1734+
if (stats->stavalues[k] != NULL)
17361735
{
17371736
ArrayType *arry;
17381737

src/backend/tsearch/ts_selfuncs.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -239,8 +239,8 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
239239
}
240240

241241
/*
242-
* Grab the lowest frequency. compute_tsvector_stats() stored it for us in
243-
* the one before the last cell of the Numbers array. See ts_typanalyze.c
242+
* Grab the lowest MCE frequency. compute_tsvector_stats() stored it for
243+
* us in the one before the last cell of the Numbers array.
244244
*/
245245
minfreq = numbers[nnumbers - 2];
246246

@@ -348,7 +348,7 @@ tsquery_opr_selec(QueryItem *item, char *operand,
348348
* preserves the property that "word:*" should be estimated to
349349
* match at least as many rows as "word" would be.
350350
*/
351-
selec = Max(Min(DEFAULT_TS_MATCH_SEL, minfreq / 2), selec);
351+
selec = Max(Min(DEFAULT_TS_MATCH_SEL, minfreq), selec);
352352
}
353353
else
354354
{
@@ -375,9 +375,9 @@ tsquery_opr_selec(QueryItem *item, char *operand,
375375
{
376376
/*
377377
* The element is not in MCELEM. Punt, but assume that the
378-
* selectivity cannot be more than minfreq / 2.
378+
* selectivity cannot be more than minfreq.
379379
*/
380-
selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
380+
selec = Min(DEFAULT_TS_MATCH_SEL, minfreq);
381381
}
382382
}
383383
}

src/backend/tsearch/ts_typanalyze.c

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -312,7 +312,7 @@ compute_tsvector_stats(VacAttrStats *stats,
312312
/*
313313
* Construct an array of the interesting hashtable items, that is,
314314
* those meeting the cutoff frequency (s - epsilon)*N. Also identify
315-
* the minimum and maximum frequencies among these items.
315+
* the maximum frequency among these items.
316316
*
317317
* Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff
318318
* frequency is 9*N / bucket_width.
@@ -324,14 +324,12 @@ compute_tsvector_stats(VacAttrStats *stats,
324324

325325
hash_seq_init(&scan_status, lexemes_tab);
326326
track_len = 0;
327-
minfreq = lexeme_no;
328327
maxfreq = 0;
329328
while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
330329
{
331330
if (item->frequency > cutoff_freq)
332331
{
333332
sort_table[track_len++] = item;
334-
minfreq = Min(minfreq, item->frequency);
335333
maxfreq = Max(maxfreq, item->frequency);
336334
}
337335
}
@@ -346,19 +344,38 @@ compute_tsvector_stats(VacAttrStats *stats,
346344
* If we obtained more lexemes than we really want, get rid of those
347345
* with least frequencies. The easiest way is to qsort the array into
348346
* descending frequency order and truncate the array.
347+
*
348+
* If we did not find more elements than we want, then it is safe to
349+
* assume that the stored MCE array will contain every element with
350+
* frequency above the cutoff. In that case, rather than storing the
351+
* smallest frequency we are keeping, we want to store the minimum
352+
* frequency that would have been accepted as a valid MCE. The
353+
* selectivity functions can assume that that is an upper bound on the
354+
* frequency of elements not present in the array.
355+
*
356+
* If we found no candidate MCEs at all, we still want to record the
357+
* cutoff frequency, since it's still valid to assume that no element
358+
* has frequency more than that.
349359
*/
350360
if (num_mcelem < track_len)
351361
{
352362
qsort_interruptible(sort_table, track_len, sizeof(TrackItem *),
353363
trackitem_compare_frequencies_desc, NULL);
354-
/* reset minfreq to the smallest frequency we're keeping */
364+
/* set minfreq to the smallest frequency we're keeping */
355365
minfreq = sort_table[num_mcelem - 1]->frequency;
356366
}
357367
else
368+
{
358369
num_mcelem = track_len;
370+
/* set minfreq to the minimum frequency above the cutoff */
371+
minfreq = cutoff_freq + 1;
372+
/* ensure maxfreq is nonzero, too */
373+
if (track_len == 0)
374+
maxfreq = minfreq;
375+
}
359376

360377
/* Generate MCELEM slot entry */
361-
if (num_mcelem > 0)
378+
if (num_mcelem >= 0)
362379
{
363380
MemoryContext old_context;
364381
Datum *mcelem_values;

src/backend/utils/adt/array_selfuncs.c

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -544,13 +544,13 @@ mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
544544

545545
if (numbers)
546546
{
547-
/* Grab the lowest observed frequency */
547+
/* Grab the minimal MCE frequency */
548548
minfreq = numbers[nmcelem];
549549
}
550550
else
551551
{
552552
/* Without statistics make some default assumptions */
553-
minfreq = 2 * (float4) DEFAULT_CONTAIN_SEL;
553+
minfreq = (float4) DEFAULT_CONTAIN_SEL;
554554
}
555555

556556
/* Decide whether it is faster to use binary search or not. */
@@ -622,9 +622,9 @@ mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
622622
{
623623
/*
624624
* The element is not in MCELEM. Punt, but assume that the
625-
* selectivity cannot be more than minfreq / 2.
625+
* selectivity cannot be more than minfreq.
626626
*/
627-
elem_selec = Min(DEFAULT_CONTAIN_SEL, minfreq / 2);
627+
elem_selec = Min(DEFAULT_CONTAIN_SEL, minfreq);
628628
}
629629

630630
/*
@@ -728,7 +728,7 @@ mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
728728

729729
/*
730730
* Grab some of the summary statistics that compute_array_stats() stores:
731-
* lowest frequency, frequency of null elements, and average distinct
731+
* lowest MCE frequency, frequency of null elements, and average distinct
732732
* element count.
733733
*/
734734
minfreq = numbers[nmcelem];
@@ -803,10 +803,10 @@ mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
803803
{
804804
/*
805805
* The element is not in MCELEM. Punt, but assume that the
806-
* selectivity cannot be more than minfreq / 2.
806+
* selectivity cannot be more than minfreq.
807807
*/
808808
elem_selec[unique_nitems] = Min(DEFAULT_CONTAIN_SEL,
809-
minfreq / 2);
809+
minfreq);
810810
}
811811

812812
unique_nitems++;

src/backend/utils/adt/array_typanalyze.c

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,7 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
461461
/*
462462
* Construct an array of the interesting hashtable items, that is,
463463
* those meeting the cutoff frequency (s - epsilon)*N. Also identify
464-
* the minimum and maximum frequencies among these items.
464+
* the maximum frequency among these items.
465465
*
466466
* Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff
467467
* frequency is 9*N / bucket_width.
@@ -473,14 +473,12 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
473473

474474
hash_seq_init(&scan_status, elements_tab);
475475
track_len = 0;
476-
minfreq = element_no;
477476
maxfreq = 0;
478477
while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
479478
{
480479
if (item->frequency > cutoff_freq)
481480
{
482481
sort_table[track_len++] = item;
483-
minfreq = Min(minfreq, item->frequency);
484482
maxfreq = Max(maxfreq, item->frequency);
485483
}
486484
}
@@ -497,19 +495,38 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
497495
* If we obtained more elements than we really want, get rid of those
498496
* with least frequencies. The easiest way is to qsort the array into
499497
* descending frequency order and truncate the array.
498+
*
499+
* If we did not find more elements than we want, then it is safe to
500+
* assume that the stored MCE array will contain every element with
501+
* frequency above the cutoff. In that case, rather than storing the
502+
* smallest frequency we are keeping, we want to store the minimum
503+
* frequency that would have been accepted as a valid MCE. The
504+
* selectivity functions can assume that that is an upper bound on the
505+
* frequency of elements not present in the array.
506+
*
507+
* If we found no candidate MCEs at all, we still want to record the
508+
* cutoff frequency, since it's still valid to assume that no element
509+
* has frequency more than that.
500510
*/
501511
if (num_mcelem < track_len)
502512
{
503513
qsort_interruptible(sort_table, track_len, sizeof(TrackItem *),
504514
trackitem_compare_frequencies_desc, NULL);
505-
/* reset minfreq to the smallest frequency we're keeping */
515+
/* set minfreq to the smallest frequency we're keeping */
506516
minfreq = sort_table[num_mcelem - 1]->frequency;
507517
}
508518
else
519+
{
509520
num_mcelem = track_len;
521+
/* set minfreq to the minimum frequency above the cutoff */
522+
minfreq = cutoff_freq + 1;
523+
/* ensure maxfreq is nonzero, too */
524+
if (track_len == 0)
525+
maxfreq = minfreq;
526+
}
510527

511528
/* Generate MCELEM slot entry */
512-
if (num_mcelem > 0)
529+
if (num_mcelem >= 0)
513530
{
514531
MemoryContext old_context;
515532
Datum *mcelem_values;

src/include/catalog/pg_statistic.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,10 @@ DECLARE_FOREIGN_KEY((starelid, staattnum), pg_attribute, (attrelid, attnum));
240240
* the fraction of non-null rows that contain at least one null element). If
241241
* this member is omitted, the column is presumed to contain no null elements.
242242
*
243+
* Starting in v19, the first extra member can be smaller than the smallest
244+
* frequency of any stored MCE, indicating that it's known that no element
245+
* not present in the MCE array has frequency greater than that value.
246+
*
243247
* Note: in current usage for tsvector columns, the stavalues elements are of
244248
* type text, even though their representation within tsvector is not
245249
* exactly text.

0 commit comments

Comments
 (0)