Skip to content

Commit d3c1208

Browse files
author
Commitfest Bot
committed
[CF 5636] v2 - Memoize ANTI and SEMI JOIN inner
This branch was automatically generated by a robot using patches from an email thread registered at: https://commitfest.postgresql.org/patch/5636 The branch will be overwritten each time a new patch version is posted to the thread, and also periodically to check for bitrot caused by changes on the master branch. Patch(es): https://www.postgresql.org/message-id/CAMbWs48g0Q_mWJip_dDEcgaCHZw71MRB=pkz4B6PzWyfVWWaCA@mail.gmail.com Author(s): Andrei Lepikhov
2 parents a6a4641 + 8fc5c2d commit d3c1208

File tree

3 files changed

+112
-22
lines changed

3 files changed

+112
-22
lines changed

src/backend/optimizer/path/joinpath.c

Lines changed: 25 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -154,13 +154,17 @@ add_paths_to_joinrel(PlannerInfo *root,
154154
/*
155155
* See if the inner relation is provably unique for this outer rel.
156156
*
157-
* We have some special cases: for JOIN_SEMI and JOIN_ANTI, it doesn't
158-
* matter since the executor can make the equivalent optimization anyway;
159-
* we need not expend planner cycles on proofs. For JOIN_UNIQUE_INNER, we
160-
* must be considering a semijoin whose inner side is not provably unique
161-
* (else reduce_unique_semijoins would've simplified it), so there's no
162-
* point in calling innerrel_is_unique. However, if the LHS covers all of
163-
* the semijoin's min_lefthand, then it's appropriate to set inner_unique
157+
* We have some special cases: for JOIN_SEMI, it doesn't matter since the
158+
* executor can make the equivalent optimization anyway. It also doesn't
159+
* help enable use of Memoize, since a semijoin with a provably unique
160+
* inner side should have been reduced to an inner join in that case.
161+
* Therefore, we need not expend planner cycles on proofs. (For
162+
* JOIN_ANTI, although it doesn't help the executor for the same reason,
163+
* it can benefit Memoize paths.) For JOIN_UNIQUE_INNER, we must be
164+
* considering a semijoin whose inner side is not provably unique (else
165+
* reduce_unique_semijoins would've simplified it), so there's no point in
166+
* calling innerrel_is_unique. However, if the LHS covers all of the
167+
* semijoin's min_lefthand, then it's appropriate to set inner_unique
164168
* because the path produced by create_unique_path will be unique relative
165169
* to the LHS. (If we have an LHS that's only part of the min_lefthand,
166170
* that is *not* true.) For JOIN_UNIQUE_OUTER, pass JOIN_INNER to avoid
@@ -169,12 +173,6 @@ add_paths_to_joinrel(PlannerInfo *root,
169173
switch (jointype)
170174
{
171175
case JOIN_SEMI:
172-
case JOIN_ANTI:
173-
174-
/*
175-
* XXX it may be worth proving this to allow a Memoize to be
176-
* considered for Nested Loop Semi/Anti Joins.
177-
*/
178176
extra.inner_unique = false; /* well, unproven */
179177
break;
180178
case JOIN_UNIQUE_INNER:
@@ -715,16 +713,21 @@ get_memoize_path(PlannerInfo *root, RelOptInfo *innerrel,
715713
return NULL;
716714

717715
/*
718-
* Currently we don't do this for SEMI and ANTI joins unless they're
719-
* marked as inner_unique. This is because nested loop SEMI/ANTI joins
720-
* don't scan the inner node to completion, which will mean memoize cannot
721-
* mark the cache entry as complete.
722-
*
723-
* XXX Currently we don't attempt to mark SEMI/ANTI joins as inner_unique
724-
* = true. Should we? See add_paths_to_joinrel()
716+
* Currently we don't do this for SEMI and ANTI joins, because nested loop
717+
* SEMI/ANTI joins don't scan the inner node to completion, which means
718+
* memoize cannot mark the cache entry as complete. Nor can we mark the
719+
* cache entry as complete after fetching the first inner tuple, because
720+
* if that tuple and the current outer tuple don't satisfy the join
721+
* clauses, a second inner tuple that satisfies the parameters would find
722+
* the cache entry already marked as complete. The only exception is when
723+
* the inner relation is provably unique, as in that case, there won't be
724+
* a second matching tuple and we can safely mark the cache entry as
725+
* complete after fetching the first inner tuple. Note that in such
726+
* cases, the SEMI join should have been reduced to an inner join by
727+
* reduce_unique_semijoins.
725728
*/
726-
if (!extra->inner_unique && (jointype == JOIN_SEMI ||
727-
jointype == JOIN_ANTI))
729+
if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI) &&
730+
!extra->inner_unique)
728731
return NULL;
729732

730733
/*

src/test/regress/expected/memoize.out

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ begin
2525
ln := regexp_replace(ln, 'Heap Fetches: \d+', 'Heap Fetches: N');
2626
ln := regexp_replace(ln, 'loops=\d+', 'loops=N');
2727
ln := regexp_replace(ln, 'Index Searches: \d+', 'Index Searches: N');
28+
ln := regexp_replace(ln, 'Memory: \d+kB', 'Memory: NkB');
2829
return next ln;
2930
end loop;
3031
end;
@@ -500,3 +501,62 @@ RESET max_parallel_workers_per_gather;
500501
RESET parallel_tuple_cost;
501502
RESET parallel_setup_cost;
502503
RESET min_parallel_table_scan_size;
504+
-- Ensure memoize works for ANTI joins
505+
CREATE TABLE tab_anti (a int, b boolean);
506+
INSERT INTO tab_anti SELECT i%3, false FROM generate_series(1,100)i;
507+
ANALYZE tab_anti;
508+
-- Ensure we get a Memoize plan for ANTI join
509+
SELECT explain_memoize('
510+
SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN
511+
LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2
512+
ON t1.a+1 = t2.a
513+
WHERE t2.a IS NULL;', false);
514+
explain_memoize
515+
--------------------------------------------------------------------------------------------
516+
Aggregate (actual rows=1.00 loops=N)
517+
-> Nested Loop Anti Join (actual rows=33.00 loops=N)
518+
-> Seq Scan on tab_anti t1 (actual rows=100.00 loops=N)
519+
-> Memoize (actual rows=0.67 loops=N)
520+
Cache Key: (t1.a + 1), t1.a
521+
Cache Mode: binary
522+
Hits: 97 Misses: 3 Evictions: Zero Overflows: 0 Memory Usage: NkB
523+
-> Subquery Scan on t2 (actual rows=0.67 loops=N)
524+
Filter: ((t1.a + 1) = t2.a)
525+
Rows Removed by Filter: 2
526+
-> Unique (actual rows=2.67 loops=N)
527+
-> Sort (actual rows=67.33 loops=N)
528+
Sort Key: t2_1.a
529+
Sort Method: quicksort Memory: NkB
530+
-> Seq Scan on tab_anti t2_1 (actual rows=100.00 loops=N)
531+
(15 rows)
532+
533+
-- And check we get the expected results.
534+
SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN
535+
LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2
536+
ON t1.a+1 = t2.a
537+
WHERE t2.a IS NULL;
538+
count
539+
-------
540+
33
541+
(1 row)
542+
543+
-- Ensure we do not add memoize node for SEMI join
544+
EXPLAIN (COSTS OFF)
545+
SELECT * FROM tab_anti t1 WHERE t1.a IN
546+
(SELECT a FROM tab_anti t2 WHERE t2.b IN
547+
(SELECT t1.b FROM tab_anti t3 WHERE t2.a > 1 OFFSET 0));
548+
QUERY PLAN
549+
-------------------------------------------------
550+
Nested Loop Semi Join
551+
-> Seq Scan on tab_anti t1
552+
-> Nested Loop Semi Join
553+
Join Filter: (t1.a = t2.a)
554+
-> Seq Scan on tab_anti t2
555+
-> Subquery Scan on "ANY_subquery"
556+
Filter: (t2.b = "ANY_subquery".b)
557+
-> Result
558+
One-Time Filter: (t2.a > 1)
559+
-> Seq Scan on tab_anti t3
560+
(10 rows)
561+
562+
DROP TABLE tab_anti;

src/test/regress/sql/memoize.sql

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ begin
2626
ln := regexp_replace(ln, 'Heap Fetches: \d+', 'Heap Fetches: N');
2727
ln := regexp_replace(ln, 'loops=\d+', 'loops=N');
2828
ln := regexp_replace(ln, 'Index Searches: \d+', 'Index Searches: N');
29+
ln := regexp_replace(ln, 'Memory: \d+kB', 'Memory: NkB');
2930
return next ln;
3031
end loop;
3132
end;
@@ -244,3 +245,29 @@ RESET max_parallel_workers_per_gather;
244245
RESET parallel_tuple_cost;
245246
RESET parallel_setup_cost;
246247
RESET min_parallel_table_scan_size;
248+
249+
-- Ensure memoize works for ANTI joins
250+
CREATE TABLE tab_anti (a int, b boolean);
251+
INSERT INTO tab_anti SELECT i%3, false FROM generate_series(1,100)i;
252+
ANALYZE tab_anti;
253+
254+
-- Ensure we get a Memoize plan for ANTI join
255+
SELECT explain_memoize('
256+
SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN
257+
LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2
258+
ON t1.a+1 = t2.a
259+
WHERE t2.a IS NULL;', false);
260+
261+
-- And check we get the expected results.
262+
SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN
263+
LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2
264+
ON t1.a+1 = t2.a
265+
WHERE t2.a IS NULL;
266+
267+
-- Ensure we do not add memoize node for SEMI join
268+
EXPLAIN (COSTS OFF)
269+
SELECT * FROM tab_anti t1 WHERE t1.a IN
270+
(SELECT a FROM tab_anti t2 WHERE t2.b IN
271+
(SELECT t1.b FROM tab_anti t3 WHERE t2.a > 1 OFFSET 0));
272+
273+
DROP TABLE tab_anti;

0 commit comments

Comments
 (0)